%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.96", %%% date = "03 October 2025", %%% time = "17:09:38 MDT", %%% filename = "vldbe.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "04668 151564 761291 7204467", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX; bibliography; Very Large Data Bases; %%% Proceedings of the VLDB Endowment", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE bibliography of %%% publications in the Proceedings of the VLDB %%% Endowment (CODEN unknown, ISSN 2150-8097). %%% %%% The journal has a Web site at %%% %%% http://portal.acm.org/citation.cfm?id=J1174 %%% %%% At version 1.96, the year coverage looked %%% like this: %%% %%% 2008 ( 169) 2014 ( 230) 2020 ( 234) %%% 2009 ( 167) 2015 ( 224) 2021 ( 345) %%% 2010 ( 193) 2016 ( 183) 2022 ( 357) %%% 2011 ( 75) 2017 ( 207) 2023 ( 372) %%% 2012 ( 187) 2018 ( 201) 2024 ( 447) %%% 2013 ( 238) 2019 ( 230) 2025 ( 316) %%% %%% Article: 4375 %%% %%% Total entries: 4375 %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. 
%%% This is produced by Robert
%%% Solovay's checksum utility.",
%%% }
%%% ====================================================================
%%% Each definition below starts on its own line, so that tools which
%%% treat a percent sign as a comment-to-end-of-line cannot swallow it.
@Preamble{
    "\ifx \undefined \circled \def \circled #1{(#1)}\fi" #
    "\ifx \undefined \k \let \k = \c \fi" #
    "\ifx \undefined \ocirc \def \ocirc #1{{\accent'27#1}}\fi" #
    "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi" #
    "\ifx \undefined \reg \def \reg {\circled{R}}\fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-PROC-VLDB-ENDOWMENT = "Proceedings of the VLDB Endowment"}

%%% ====================================================================
%%% Bibliography entries, sorted in publication order:
@Article{Hill:2008:TMO,
  author =       "Mark D. Hill",
  title =        "Is transactional memory an oxymoron?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "1--1",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453858",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zobel:2008:DSH,
  author =       "Justin Zobel",
  title =        "Databases and the silification of health",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "2--2",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453859",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Blott:2008:WWH,
  author =       "Stephen Blott and Roger Weber",
  title =        "What's wrong with high-dimensional similarity search?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "3--3",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453861",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bruno:2008:CPD,
  author =       "Nicolas Bruno and Surajit Chaudhuri",
  title =        "Constrained physical design tuning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "4--15",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453863",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The compound surname "Van de Craen" is braced below so that BibTeX
%%% keeps it as one family name instead of taking "Van" as a given name.
@Article{Kementsietsidis:2008:SMQ,
  author =       "Anastasios Kementsietsidis and Frank Neven and Dieter {Van de Craen} and Stijn Vansummeren",
  title =        "Scalable multi-query optimization for exploratory queries over federated scientific databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "16--27",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453864",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{DeWitt:2008:CIC,
  author =       "David J. DeWitt and Erik Paulson and Eric Robinson and Jeffrey Naughton and Joshua Royalty and Srinath Shankar and Andrew Krioukov",
  title =        "{Clustera}: an integrated computation and data management system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "28--41",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453865",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cheung:2008:PPE,
  author =       "Alvin Cheung and Samuel Madden",
  title =        "Performance profiling with {EndoScope}, an acquisitional software monitoring framework",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "42--53",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453866",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bar-Yossef:2008:MSE,
  author =       "Ziv Bar-Yossef and Maxim Gurevich",
  title =        "Mining search engine query logs via suggestion sampling",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "54--65",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453868",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Akdere:2008:PBC,
  author =       "Mert Akdere and U{\u{g}}ur {\c{C}}etintemel and Nesime Tatbul",
  title =        "Plan-based complex event detection across distributed sources",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "66--77",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453869",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lachmann:2008:FRP,
  author =       "Alexander Lachmann and Mirek Riedewald",
  title =        "Finding relevant patterns in bursty sequences",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "78--89",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453870",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cheng:2008:CLW,
  author =       "Hao Cheng and Kien A. Hua and Khanh Vu",
  title =        "Constrained locally weighted clustering",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "90--101",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453871",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hay:2008:RSR,
  author =       "Michael Hay and Gerome Miklau and David Jensen and Don Towsley and Philipp Weis",
  title =        "Resisting structural re-identification in anonymized social networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "102--114",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453873",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Terrovitis:2008:PPA,
  author =       "Manolis Terrovitis and Nikos Mamoulis and Panos Kalnis",
  title =        "Privacy-preserving anonymization of set-valued data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "115--125",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453874",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pang:2008:AQR,
  author =       "HweeHwa Pang and Kyriakos Mouratidis",
  title =        "Authenticating the query results of text search engines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "126--137",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453875",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kundu:2008:SST,
  author =       "Ashish Kundu and Elisa Bertino",
  title =        "Structural signatures for tree data structures",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "138--150",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453876",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Roitman:2008:MDC,
  author =       "Haggai Roitman and David Carmel and Elad Yom-Tov",
  title =        "Maintaining dynamic channel profiles on the {Web}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "151--162",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453878",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yang:2008:WDD,
  author =       "Fan Yang and Nitin Gupta and Chavdar Botev and Elizabeth F. Churchill and George Levchenko and Jayavel Shanmugasundaram",
  title =        "{WYSIWYG} development of data driven {Web} applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "163--175",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453879",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Baykan:2008:WPL,
  author =       "Eda Baykan and Monika Henzinger and Ingmar Weber",
  title =        "{Web} page language identification based on {URLs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "176--187",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453880",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Han:2008:PQO,
  author =       "Wook-Shin Han and Wooseong Kwak and Jinsoo Lee and Guy M.
Lohman and Volker Markl",
  title =        "Parallelizing query optimization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "188--200",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453882",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hadjieleftheriou:2008:HSS,
  author =       "Marios Hadjieleftheriou and Xiaohui Yu and Nick Koudas and Divesh Srivastava",
  title =        "Hashed samples: selectivity estimators for set similarity selection queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "201--212",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453883",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cohen:2008:TEU,
  author =       "Edith Cohen and Haim Kaplan",
  title =        "Tighter estimation using bottom $k$ sketches",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "213--229",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453884",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Alexe:2008:STB,
  author =       "Bogdan Alexe and Wang-Chiew Tan and Yannis Velegrakis",
  title =        "{STBenchmark}: towards a benchmark for mapping systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "230--244",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453886",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Katsis:2008:ISR,
  author =       "Yannis Katsis and Alin Deutsch and Yannis Papakonstantinou",
  title =        "Interactive source registration in community-oriented information integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "245--259",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453887",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hernandez:2008:DED,
  author =       "Mauricio A.
Hern{\'a}ndez and Paolo Papotti and Wang-Chiew Tan",
  title =        "Data exchange with data-metadata translations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "260--273",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453888",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2008:OPN,
  author =       "Jin Li and Kristin Tufte and Vladislav Shkapenyuk and Vassilis Papadimos and Theodore Johnson and David Maier",
  title =        "Out-of-order processing: a new architecture for high-performance stream systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "274--288",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453890",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Han:2008:SET,
  author =       "Wook-Shin Han and Haifeng Jiang and Howard Ho and Quanzhong Li",
  title =        "{StreamTX}: extracting tuples from streaming {XML} data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "289--300",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453891",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Jin:2008:SWT,
  author =       "Cheqing Jin and Ke Yi and Lei Chen and Jeffrey Xu Yu and Xuemin Lin",
  title =        "Sliding-window top-$k$ queries on uncertain streams",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "301--312",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453892",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Koch:2008:CPD,
  author =       "Christoph Koch and Dan Olteanu",
  title =        "Conditioning probabilistic databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "313--325",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453894",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Beskales:2008:EST,
  author =       "George Beskales and Mohamed A. Soliman and Ihab F.
Ilyas",
  title =        "Efficient search for the top-$k$ probable nearest neighbors in uncertain databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "326--339",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453895",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2008:BML,
  author =       "Daisy Zhe Wang and Eirinaios Michelakis and Minos Garofalakis and Joseph M. Hellerstein",
  title =        "{BayesStore}: managing large, uncertain data repositories with probabilistic graphical models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "340--351",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453896",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2008:TIT,
  author =       "Daniel Deutch and Tova Milo",
  title =        "Type inference and type checking for queries on execution traces",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "352--363",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453898",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Shang:2008:TVH,
  author =       "Haichuan Shang and Ying Zhang and Xuemin Lin and Jeffrey Xu Yu",
  title =        "Taming verification hardness: an efficient algorithm for testing subgraph isomorphism",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "364--375",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453899",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Golab:2008:GNO,
  author =       "Lukasz Golab and Howard Karloff and Flip Korn and Divesh Srivastava and Bei Yu",
  title =        "On generating near-optimal tableaux for conditional functional dependencies",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "376--390",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453900",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2008:PFD,
  author =       "Wenfei Fan and Shuai Ma and Yanli Hu and Jie Liu and Yinghui Wu",
  title =        "Propagating functional dependencies with conditions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "391--407",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453901",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's family name is the hyphenated "Garcia-Molina";
%%% written with a space, BibTeX would take "Garcia" as a given name.
@Article{Antonellis:2008:SQR,
  author =       "Ioannis Antonellis and Hector Garcia-Molina and Chi Chao Chang",
  title =        "{Simrank++}: query rewriting through link analysis of the click graph",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "408--421",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453903",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lizorkin:2008:AEO,
  author =       "Dmitry Lizorkin and Pavel Velikhov and Maxim Grinev and Denis Turdakov",
  title =        "Accuracy estimate and optimization techniques for {SimRank} computation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "422--433",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453904",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chandramouli:2008:EES,
  author =       "Badrish Chandramouli and Jun Yang",
  title =        "End-to-end support for joins in large-scale publish\slash subscribe systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "434--450",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453905",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Machanavajjhala:2008:SRP,
  author =       "Ashwin Machanavajjhala and Erik Vee and Minos Garofalakis and Jayavel Shanmugasundaram",
  title =        "Scalable ranked publish\slash subscribe",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "451--462",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453906",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Teubner:2008:DCF,
  author =       "Jens Teubner and Torsten Grust and Sebastian Maneth and Sherif Sakr",
  title =        "Dependable cardinality forecasts for {XQuery}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "463--477",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453908",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2008:HBS,
  author =       "Hongzhi Wang and Jianzhong Li and Jizhou Luo and Hong Gao",
  title =        "Hash-base subgraph query processing method for graph-structured {XML} documents",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "478--489",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453909",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cohen:2008:GXS,
  author =       "Sara Cohen",
  title =        "Generating {XML} structure using examples and constraints",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "490--501",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453910",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Holloway:2008:ROD,
  author =       "Allison L. Holloway and David J. DeWitt",
  title =        "Read-optimized databases, in depth",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "502--513",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453912",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Koltsidas:2008:FSL,
  author =       "Ioannis Koltsidas and Stratis D. Viglas",
  title =        "Flashing up the storage layer",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "514--525",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453913",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sears:2008:RCL,
  author =       "Russell Sears and Mark Callaghan and Eric Brewer",
  title =        "{Rose}: compressed, log-structured replication",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "526--537",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453914",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cafarella:2008:WEP,
  author =       "Michael J.
Cafarella and Alon Halevy and Daisy Zhe Wang and Eugene Wu and Yang Zhang", title = "{WebTables}: exploring the power of tables on the {Web}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "538--549", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453916", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Garrod:2008:SQR, author = "Charles Garrod and Amit Manjhi and Anastasia Ailamaki and Bruce Maggs and Todd Mowry and Christopher Olston and Anthony Tomasic", title = "Scalable query result caching for {Web} applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "550--561", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453917", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Braga:2008:OMD, author = "Daniele Braga and Stefano Ceri and Florian Daniel and Davide Martinenghi", title = "Optimization of multi-domain queries on the {Web}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "562--573", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453918", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kwon:2008:FTS, author = "YongChul Kwon and Magdalena Balazinska and Albert Greenberg", title = "Fault-tolerant stream processing using a distributed, replicated file system", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "574--585", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453920", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yeh:2008:LLW, author = "Mi-Yen Yeh and Kun-Lung Wu and Philip S. Yu and Ming-Syan Chen", title = "{LeeWave}: level-wise distribution of wavelet coefficients for processing $k$ {NN} queries over distributed streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "586--597", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453921", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aguilera:2008:PSD, author = "Marcos K. Aguilera and Wojciech Golab and Mehul A. 
Shah", title = "A practical scalable distributed {B-tree}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "598--609", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453922", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiao:2008:MMS, author = "Lin Qiao and Vijayshankar Raman and Frederick Reiss and Peter J. Haas and Guy M. Lohman", title = "Main-memory scan sharing for multi-core {CPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "610--621", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453924", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Johnson:2008:RWP, author = "Ryan Johnson and Vijayshankar Raman and Richard Sidle and Garret Swart", title = "Row-wise parallel predicate evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "622--634", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453925", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Soundararajan:2008:DPC, author = "Gokul Soundararajan and Jin Chen and Mohamed A. Sharaf and Cristiana Amza", title = "Dynamic partitioning of the cache hierarchy in shared data centers", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "635--646", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453926", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Neumann:2008:RRS, author = "Thomas Neumann and Gerhard Weikum", title = "{RDF-3X}: a {RISC}-style engine for {RDF}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "647--659", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453927", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Simitsis:2008:MCE, author = "Alkis Simitsis and Akanksha Baid and Yannis Sismanis and Berthold Reinwald", title = "Multidimensional content {eXploration}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "660--671", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453929", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fontoura:2008:RTS, author = "Marcus Fontoura and Vanja Josifovski and Ravi Kumar and Christopher Olston and Andrew Tomkins and Sergei Vassilvitskii", title = "Relaxation in text search using taxonomies", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "672--683", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453930", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2008:LEF, author = "Hoa Nguyen and Thanh Nguyen and Juliana Freire", title = "Learning to extract form labels", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "684--694", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453931", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jayapandian:2008:ACF, author = "Magesh Jayapandian and H. V. Jagadish", title = "Automated creation of a forms-based database query interface", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "695--709", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453932", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yahia:2008:ENA, author = "Sihem Amer-Yahia and Michael Benedikt and Laks V. S. 
Lakshmanan and Julia Stoyanovich", title = "Efficient network aware search in collaborative tagging sites", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "710--721", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453934", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cheng:2008:CUD, author = "Reynold Cheng and Jinchuan Chen and Xike Xie", title = "Cleaning uncertain data with quality guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "722--735", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453935", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2008:PNA, author = "Jiansheng Huang and Ting Chen and AnHai Doan and Jeffrey F. Naughton", title = "On the provenance of non-answers to queries over extracted data", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "736--747", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453936", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2008:DAP, author = "Shenghuo Zhu and Tao Li and Zhiyuan Chen and Dingding Wang and Yihong Gong", title = "Dynamic active probing of helpdesk databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "748--760", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453937", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Curino:2008:GDS, author = "Carlo A. Curino and Hyun J. Moon and Carlo Zaniolo", title = "Graceful database schema evolution: the {PRISM} workbench", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "761--772", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453939", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chai:2008:ARD, author = "Xiaoyong Chai and Mayssam Sayyadian and AnHai Doan and Arnon Rosenthal and Len Seligman", title = "Analyzing and revising data integration schemas to improve their matchability", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "773--784", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453940", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Talukdar:2008:LCD, author = "Partha Pratim Talukdar and Marie Jacob and Muhammad Salman Mehmood and Koby Crammer and Zachary G. Ives and Fernando Pereira and Sudipto Guha", title = "Learning to create data-integrating queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "785--796", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453941", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Re:2008:ALP, author = "Christopher R{\'e} and Dan Suciu", title = "Approximate lineage for probabilistic databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "797--808", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453943", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sen:2008:ESC, author = "Prithviraj Sen and Amol Deshpande and Lise Getoor", title = "Exploiting shared correlations in probabilistic databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "809--820", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453944", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rastogi:2008:ACU, author = "Vibhor Rastogi and Dan Suciu and Evan Welbourne", title = "Access control over uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "821--832", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453945", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cormode:2008:ABG, author = "Graham Cormode and Divesh Srivastava and Ting Yu and Qing Zhang", title = "Anonymizing bipartite graph data using safe groupings", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "833--844", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453947", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bu:2008:PPS, author = "Yingyi Bu and Ada Wai-Chee Fu and Raymond Chi-Wing Wong and Lei Chen and Jiuyong Li", title = "Privacy preserving serial data publishing by role composition", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "845--856", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453948", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xiao:2008:OPQ, author = "Xiaokui Xiao and Yufei Tao", title = "Output perturbation with query relaxation", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "857--869", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453949", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lomet:2008:TTI, author = "David Lomet and Mingsheng Hong and Rimma Nehme and Rui Zhang", title = "Transaction time indexing with version compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "870--881", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453951", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Moon:2008:MQT, author = "Hyun J. Moon and Carlo A. 
Curino and Alin Deutsch and Chien-Yi Hou and Carlo Zaniolo", title = "Managing and querying transaction-time databases under schema evolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "882--895", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453952", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sherkat:2008:EST, author = "Reza Sherkat and Davood Rafiei", title = "On efficiently searching trajectories and archival data for historical similarities", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "896--908", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453953", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pu:2008:KQC, author = "Ken Q. Pu and Xiaohui Yu", title = "Keyword query cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "909--920", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453955", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2008:RIR, author = "Ziyang Liu and Yi Chen", title = "Reasoning and identifying relevant matches for {XML} keyword search", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "921--932", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453956", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xiao:2008:EJE, author = "Chuan Xiao and Wei Wang and Xuemin Lin", title = "{Ed-Join}: an efficient algorithm for similarity joins with edit distance constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "933--944", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453957", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Agrawal:2008:SAH, author = "Sanjay Agrawal and Kaushik Chakrabarti and Surajit Chaudhuri and Venkatesh Ganti", title = "Scalable ad-hoc entity extraction from text collections", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "945--957", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453958", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Agrawal:2008:SSS, author = "Parag Agrawal and Daniel Kifer and Christopher Olston", title = "Scheduling shared scans of large data files", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "958--969", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453960", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nath:2008:OMV, author = "Suman Nath and Phillip B. 
Gibbons", title = "Online maintenance of very large random samples on flash storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "970--983", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453961", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ge:2008:SLA, author = "Tingjian Ge and Stan Zdonik", title = "A skip-list approach for efficiently processing forecasting queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "984--995", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453962", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Phan:2008:RRF, author = "Thomas Phan and Wen-Syan Li", title = "A request-routing framework for {SOA}-based enterprise computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "996--1007", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453963", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Weiss:2008:HSI, author = "Cathrin Weiss and Panagiotis Karras and Abraham Bernstein", title = "{Hexastore}: sextuple indexing for {Semantic Web} data management", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1008--1019", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453965", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shahabi:2008:ILS, author = "Cyrus Shahabi and Lu-An Tang and Songhua Xing", title = "Indexing land surface for efficient {kNN} query", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1020--1031", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453966", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wong:2008:ESQ, author = "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Jian Pei and Yip Sing Ho and Tai Wong and Yubao Liu", title = "Efficient skyline querying with variable user preferences on nominal attributes", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1032--1043", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453967", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guo:2008:ETP, author = "Lin Guo and Sihem Amer-Yahia and Raghu Ramakrishnan and Jayavel Shanmugasundaram and Utkarsh Srivastava and Erik Vee", title = "Efficient top-$k$ processing over query-dependent functions", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1044--1055", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453968", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2008:FER, author = "Wei Wu and Fei Yang and Chee-Yong Chan and Kian-Lee Tan", title = "{FINCH}: evaluating reverse $k$-Nearest-Neighbor queries on location data", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1056--1067", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453970", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jeung:2008:DCT, author = "Hoyoung Jeung and Man Lung Yiu and Xiaofang Zhou and Christian S. Jensen and Heng Tao Shen", title = "Discovery of convoys in trajectory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1068--1080", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453971", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2008:TTC, author = "Jae-Gil Lee and Jiawei Han and Xiaolei Li and Hector Gonzalez", title = "{TraClass}: trajectory classification using hierarchical region-based and trajectory-based clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1081--1094", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453972", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nutanong:2008:VDQ, author = "Sarana Nutanong and Rui Zhang and Egemen Tanin and Lars Kulik", title = "The {V*-Diagram}: a query-dependent approach to moving {KNN} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1095--1106", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453973", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guravannavar:2008:RPB, author = "Ravindra Guravannavar and S. 
Sudarshan", title = "Rewriting procedures for batched bindings", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1107--1123", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453975", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{D:2008:IRP, author = "Harish D. and Pooja N. Darera and Jayant R. Haritsa", title = "Identifying robust plans through plan diagram reduction", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1124--1140", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453976", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chaudhuri:2008:PYG, author = "Surajit Chaudhuri and Vivek Narasayya and Ravi Ramamurthy", title = "A pay-as-you-go framework for query execution feedback", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1141--1152", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453977", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Title note: {Evita Raced} is the name of the system described in
%%% the paper, so both words are braced as a unit to keep their
%%% capitals under styles that downcase titles.
@Article{Condie:2008:ERM,
  author =       "Tyson Condie and David Chu and Joseph M. Hellerstein and Petros Maniatis",
  title =        "{Evita Raced}: metacompilation for declarative networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "1153--1165",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453978",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chiang:2008:DDQ,
  author =       "Fei Chiang and Ren{\'e}e J. Miller",
  title =        "Discovering data quality rules",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "1",
  pages =        "1166--1177",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1453856.1453980",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:36 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2008:MNR, author = "Xiang Zhang and Feng Pan and Wei Wang and Andrew Nobel", title = "Mining non-redundant high order correlations in binary data", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1178--1188", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453981", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dalvi:2008:KSE, author = "Bhavana Bharat Dalvi and Meghana Kshirsagar and S. Sudarshan", title = "Keyword search on external memory data graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1189--1204", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453982", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koltsidas:2008:SHD, author = "Ioannis Koltsidas and Heiko M{\"u}ller and Stratis D. 
Viglas", title = "Sorting hierarchical data in external memory for archiving", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "1", pages = "1205--1216", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1453856.1453983", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:36 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Metwally:2008:SSP, author = "Ahmed Metwally and Fatih Emek{\c{c}}i and Divyakant Agrawal and Amr {El Abbadi}", title = "{SLEUTH}: {Single-pubLisher attack dEtection Using correlaTion Hunting}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1217--1228", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454161", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Poess:2008:ECK, author = "Meikel Poess and Raghunath Othayoth Nambiar", title = "Energy cost, the key challenge of today's data centers: a power consumption analysis of {TPC}-{C} results", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1229--1240", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454162", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Madhavan:2008:GDW, author = "Jayant Madhavan and David Ko and Lucja Kot and Vignesh Ganapathy and Alex Rasmussen and Alon Halevy", title = "{Google}'s {Deep Web} crawl", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1241--1252", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454163", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Weis:2008:ISD, author = "Melanie Weis and Felix Naumann and Ulrich Jehle and Jens Lufter and Holger Schuster", title = "Industry-scale duplicate detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1253--1264", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454165", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chaiken:2008:SEE, author = "Ronnie Chaiken and Bob Jenkins and Per-{\AA}ke Larson and Bill Ramsey and Darren Shakib and Simon Weaver and Jingren Zhou", title = "{SCOPE}: easy and efficient parallel processing of massive data sets", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1265--1276", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454166", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cooper:2008:PYH, author = "Brian F. Cooper and Raghu Ramakrishnan and Utkarsh Srivastava and Adam Silberstein and Philip Bohannon and Hans-Arno Jacobsen and Nick Puz and Daniel Weaver and Ramana Yerneni", title = "{PNUTS}: {Yahoo!}'s hosted data serving platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1277--1288", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454167", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Acharya:2008:RSF, author = "Srini Acharya and Peter Carlin and Cesar Galindo-Legaria and Krzysztof Kozielczyk and Pawel Terlecki and Peter Zabback", title = "Relational support for flexible schema scenarios", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1289--1300", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454169", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mukherjee:2008:OSS, author = "Niloy Mukherjee and Bharath Aleti and Amit Ganesh and Krishna Kunchithapadam and Scott Lynn and Sujatha Muthulingam and Kam Shergill and Shaoyu Wang and Wei Zhang", title = "{Oracle SecureFiles System}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1301--1312", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454170", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chhugani:2008:EIS, author = "Jatin Chhugani and Anthony D. Nguyen and Victor W. 
Lee and William Macy and Mostafa Hagog and Yen-Kuang Chen and Akram Baransi and Sanjeev Kumar and Pradeep Dubey", title = "Efficient implementation of sorting on multi-core {SIMD CPU} architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1313--1324", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454171", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dey:2008:EAQ, author = "Atreyee Dey and Sourjya Bhaumik and Harish D. and Jayant R. Haritsa", title = "Efficiently approximating query optimizer plan diagrams", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1325--1336", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454173", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Slezak:2008:BAD, author = "Dominik {\'S}l{\k{e}}zak and Jakub Wr{\'o}blewski and Victoria Eastwood and Piotr Synak", title = "{Brighthouse}: an analytic data warehouse for ad-hoc queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1337--1345", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454174", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ziauddin:2008:OPC, author = "Mohamed Ziauddin and Dinesh Das and Hong Su and Yali Zhu and Khaled Yagoub", title = "Optimizer plan change management: improved stability and performance in {Oracle} 11g", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1346--1355", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454175", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2008:TPX, author = "Zhen Hua Liu and Sivasankaran Chandrasekar and Thomas Baby and Hui J. 
Chang", title = "Towards a physical {XML} independent {XQuery\slash SQL\slash XML} engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1356--1367", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454177", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2008:CQP, author = "Allison W. Lee and Mohamed Zait", title = "Closing the query processing loop in {Oracle 11g}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1368--1378", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454178", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jain:2008:TSS, author = "Namit Jain and Shailendra Mishra and Anand Srinivasan and Johannes Gehrke and Jennifer Widom and Hari Balakrishnan and U{\u{g}}ur {\c{C}}etintemel and Mitch Cherniack and Richard Tibbetts and Stan Zdonik", title = "Towards a streaming {SQL} standard", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1379--1390", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454179", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Huang:2008:ESG,
  author =       "Yu Huang and Ziyang Liu and Yi Chen",
  title =        "{eXtract}: a snippet generation system for {XML} search",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1392--1395",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454181",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Title note: {SQL Server} is a product name, so it is braced as one
%%% unit; bracing only the acronym leaves `server' in lowercase.
@Article{Terwilliger:2008:LIQ,
  author =       "James F. Terwilliger and Sergey Melnik and Philip A. Bernstein",
  title =        "Language-integrated querying of {XML} data in {SQL Server}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1396--1399",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454182",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mathis:2008:XXC,
  author =       "Christian Mathis and Andreas M.
Weiner and Theo H{\"a}rder and Caesar Ralf Franz Hoppen",
  title =        "{XTCcmp}: {XQuery} compilation on {XTC}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1400--1403",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454183",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tian:2008:PGG,
  author =       "Yuanyuan Tian and Jignesh M. Patel and Viji Nair and Sebastian Martini and Matthias Kretzler",
  title =        "{Periscope\slash GQ}: a graph querying toolkit",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1404--1407",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454184",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Title note: only the acronym {XML} is brace-protected; `data' is an
%%% ordinary word and is left to the bibliography style's casing.
@Article{Balmin:2008:SSS,
  author =       "Andrey Balmin and Latha Colby and Emiran Curtmola and Quanzhong Li and Fatma {\"O}zcan and Sharath Srinivas and Zografoula Vagena",
  title =        "{SEDA}: a system for search, exploration, discovery, and analysis of {XML} data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1408--1411",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454185",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Motahari:2008:PSD,
  author =       "Hamid Motahari and Boualem Benatallah and Regis Saint-Paul and Fabio Casati and Periklis Andritsos",
  title =        "Process spaceship: discovering and exploring process views from event logs in data spaces",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1412--1415",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454186",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Title note: the system name is one token, written here with a
%%% superscript 3 inside a single brace group; the former spaced,
%%% three-piece form typeset as three separate words.
%%% NOTE(review): confirm the superscript against the published paper;
%%% if the name is set inline, use {P3N} instead.
@Article{Lupu:2008:PPP,
  author =       "Mihai Lupu and Y. C. Tay",
  title =        "{P$^3$N}: profiling the potential of a peer-based data management system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1416--1419",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454188",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tlili:2008:PLT,
  author =       "Mounir Tlili and W.
Kokou Dedzoe and Esther Pacitti and Patrick Valduriez and Reza Akbarinia and Pascal Molli and G{\'e}r{\^o}me Canals and St{\'e}phane Lauri{\`e}re", title = "{P2P} logging and timestamping for reconciliation", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1420--1423", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454189", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luu:2008:ASP, author = "Toan Luu and Gleb Skobeltsyn and Fabius Klemm and Maroje Puh and Ivana Podnar Zarko and Martin Rajman and Karl Aberer", title = "{AlvisP2P}: scalable peer-to-peer text retrieval in a structured {P2P} network", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1424--1427", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454190", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abiteboul:2008:WEP, author = "S. Abiteboul and T. Allard and P. Chatalic and G. Gardarin and A. Ghitescu and F. Goasdou{\'e} and I. Manolescu and B. Nguyen and M. Ouazara and A. Somani and N. Travers and G. Vasile and S. 
Zoupanos",
  title =        "{WebContent}: efficient {P2P} warehousing of {Web} data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1428--1431",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454191",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Review note on the record that closes just above this comment: in
%%% its title only the acronym {P2P} is brace-protected; `warehousing'
%%% is an ordinary word and is left to the bibliography style's casing.

@Article{Jurczyk:2008:DED,
  author =       "Pawel Jurczyk and Li Xiong",
  title =        "{DObjects}: enabling distributed data services for metacomputing platforms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1432--1435",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454192",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Shao:2008:ETR,
  author =       "Qihong Shao and Yi Chen and Shu Tao and Xifeng Yan and Nikos Anerousis",
  title =        "{EasyTicket}: a ticket routing recommendation engine for enterprise problem resolution",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "1",
  number =       "2",
  pages =        "1436--1439",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1454159.1454193",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:44 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Duda:2008:ACI, author = "Cristian Duda and Gianni Frey and Donald Kossmann and Chong Zhou", title = "{AJAXSearch}: crawling, indexing and searching {Web 2.0} applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1440--1443", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454195", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2008:MSH, author = "Kun Liu and Evimaria Terzi and Tyrone Grandison", title = "{ManyAspects}: a system for highlighting diverse concepts in documents", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1444--1447", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454196", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Curtmola:2008:XDC, author = "Emiran Curtmola and Alin Deutsch and Dionysios Logothetis and K. K. 
Ramakrishnan and Divesh Srivastava and Kenneth Yocum", title = "{XTreeNet}: democratic community search", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1448--1451", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454197", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2008:EVK, author = "Guoliang Li and Jianhua Feng and Jianyong Wang and Lizhu Zhou", title = "An effective and versatile keyword search engine on heterogeneous data sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1452--1455", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454198", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Baid:2008:DME, author = "Akanksha Baid and Andrey Balmin and Heasoo Hwang and Erik Nijkamp and Jun Rao and Berthold Reinwald and Alkis Simitsis and Yannis Sismanis and Frank van Ham", title = "{DBPubs}: multidimensional exploration of database publications", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1456--1459", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454199", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2008:SDQ, author = "Wenfei Fan and Floris Geerts and Xibei Jia", title = "{Semandaq}: a data quality system based on conditional functional dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1460--1463", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454200", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Katsis:2008:RTI, author = "Yannis Katsis and Alin Deutsch and Yannis Papakonstantinou and Keliang Zhao", title = "{RIDE}: a tool for interactive source registration in community-oriented information integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1464--1467", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454202", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alexe:2008:CEM, author = "Bogdan Alexe and Wang-Chiew Tan and Yannis Velegrakis", title = "Comparing and evaluating mapping systems with {STBenchmark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1468--1471", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454203", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Logothetis:2008:AHD, author = "Dionysios Logothetis and Kenneth Yocum", title = "Ad-hoc data processing in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1472--1475", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454204", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Weigel:2008:LSC, author = "Felix Weigel and Biswanath Panda and Mirek Riedewald and Johannes Gehrke and Manuel Calimlim", title = "Large-scale collaborative analysis and extraction of {Web} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1476--1479", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454205", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Crecelius:2008:MSS, author = "Tom Crecelius and Mouna Kacimi and Sebastian Michel and Thomas Neumann and Josiane Xavier Parreira and Ralf Schenkel and Gerhard Weikum", title = "Making {SENSE}: socially enhanced search and exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1480--1483", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454206", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2008:ASD, author = "Wentian Lu and Gerome Miklau", title = "{AuditGuard}: a system for database auditing under retention restrictions", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1484--1487", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454207", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hu:2008:QVQ, author = "Ling Hu and Kenneth A. Ross and Yuan-Chi Chang and Christian A. Lang and Donghui Zhang", title = "{QueryScope}: visualizing queries for repeatable database tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1488--1491", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454209", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Hose:2008:WIT,
  author          = {Katja Hose and Daniel Klan and Matthias Marx and Kai-Uwe Sattler},
  title           = {When is it time to rethink the aggregate configuration of your {OLAP} server?},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1492--1495},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454210},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Kallman:2008:HSH, author = "Robert Kallman and Hideaki Kimura and Jonathan Natkins and Andrew Pavlo and Alexander Rasin and Stanley Zdonik and Evan P. C. Jones and Samuel Madden and Michael Stonebraker and Yang Zhang and John Hugg and Daniel J. Abadi", title = "{H-store}: a high-performance, distributed main memory transaction processing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1496--1499", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454211", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Perlman:2008:OIN,
  author          = {Eric Perlman and Randal Burns and Michael Kazhdan},
  title           = {Organizing and indexing non-convex regions},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1500--1503},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454212},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Paquet:2008:CME,
  author          = {Eric Paquet and Herna L. Viktor},
  title           = {{Capri\slash MR}: exploring protein databases from a structural and physicochemical point of view},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1504--1507},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454213},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Guo:2008:CMM, author = "Fan Guo and Lei Li and Christos Faloutsos and Eric P.
Xing", title = "{C-DEM}: a multi-modal query system for {Drosophila Embryo} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1508--1511", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454214", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Milo:2008:QMD,
  author          = {Tova Milo and Daniel Deutch},
  title           = {Querying and monitoring distributed business processes},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1512--1515},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454216},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Franklin:2008:FTD, author = "Michael Franklin and Alon Halevy and David Maier", title = "A first tutorial on dataspaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1516--1517", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454217", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Franconi:2008:ODM,
  author          = {Enrico Franconi},
  title           = {Ontologies and databases: myths and challenges},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1518--1519},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454218},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Balazinska:2008:SAP,
  author          = {Magdalena Balazinska and Christopher R{\'e} and Dan Suciu},
  title           = {Systems aspects of probabilistic data management},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1520--1521},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454219},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Fan:2008:RIC, author = "Wenfei Fan and Floris Geerts and Xibei Jia", title = "A revival of integrity constraints for data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1522--1523", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454220", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

%%% Title note: only the acronym XML is brace-protected.  Bracing the
%%% whole title would stop bibliography styles from applying their own
%%% casing, and the other titles in this file are stored in sentence
%%% case with just acronyms and proper nouns protected.
@article{Moro:2008:XSS,
  author          = {Mirella M. Moro and Zografoula Vagena and Vassilis J. Tsotras},
  title           = {{XML} structural summaries},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1524--1525},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454221},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Sharaf:2008:SCQ, author = "Mohamed A. Sharaf and Alexandros Labrinidis and Panos K. Chrysanthis", title = "Scheduling continuous queries in data stream management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1526--1527", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Kriegel:2008:DCM,
  author          = {Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur Zimek},
  title           = {Detecting clusters in moderate-to-high dimensional data: subspace clustering, pattern-based clustering, and correlation clustering},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1528--1529},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454223},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Cormode:2008:FFI, author = "Graham Cormode and Marios Hadjieleftheriou", title = "Finding frequent items in data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1530--1541", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454225", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Ding:2008:QMT,
  author          = {Hui Ding and Goce Trajcevski and Peter Scheuermann and Xiaoyue Wang and Eamonn Keogh},
  title           = {Querying and mining of time series data: experimental comparison of representations and distance measures},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1542--1552},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454226},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Sidirourgos:2008:CSS, author = "Lefteris Sidirourgos and Romulo Goncalves and Martin Kersten and Niels Nes and Stefan Manegold", title = "Column-store support for {RDF} data management: not all swans are white", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1553--1563", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454227", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Sans:2008:PBN,
  author          = {Virginie Sans and Dominique Laurent},
  title           = {Prefix based numbering schemes for {XML}: techniques, applications and performances},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1564--1573},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454228},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Chen:2008:BEM, author = "Su Chen and Christian S. Jensen and Dan Lin", title = "A benchmark for evaluating moving object indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1574--1585", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454229", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

%%% Author note: the third author's family name is the compound
%%% "Vaz Salles".  It is braced so that BibTeX keeps both words in the
%%% Last part of the name instead of treating "Vaz" as a given name.
@article{Dittrich:2008:DRM,
  author          = {Jens Dittrich and Lukas Blunschi and Marcos Antonio {Vaz Salles}},
  title           = {Dwarfs in the rearview mirror: how big are they really?},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1586--1597},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454230},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Shao:2008:CTE, author = "Jie Shao and Heng Tao Shen and Xiaofang Zhou", title = "Challenges and techniques for effective and efficient similarity search in large video databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1598--1603", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454232", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Hopfgartner:2008:SIM,
  author          = {Frank Hopfgartner},
  title           = {Studying interaction methodologies in video retrieval},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1604--1608},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454233},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Lo:2008:MPR,
  author          = {David Lo and Siau-Cheng Khoo},
  title           = {Mining patterns and rules for software specification discovery},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1609--1616},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454234},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

%%% Author note: the second author's family name carries a hacek on the
%%% initial S.  It is written as a BibTeX special character (a brace
%%% group whose first token is a control sequence) so that it still
%%% sorts and labels as the letter S.
@Article{Biveinis:2008:TEM, author = "Laurynas Biveinis and Simonas {\v{S}}altenis", title = "Towards efficient main-memory use for optimum tree index update", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1617--1622", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454236", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Holupirek:2008:IFT,
  author          = {Alexander Holupirek and Marc H. Scholl},
  title           = {Implementing filesystems by tree-aware {DBMSs}},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1623--1630},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454237},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Avanes:2008:AWS, author = "Artin Avanes and Johann-Christoph Freytag", title = "Adaptive workflow scheduling under resource allocation constraints and network dynamics", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1631--1637", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454238", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Zerr:2008:PPD,
  author          = {Sergej Zerr and Wolfgang Nejdl},
  title           = {Privacy preserving document indexing infrastructure for a distributed environment},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1638--1643},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454240},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Miao:2008:GTG,
  author          = {Jiajia Miao},
  title           = {{GS-TMS}: a global stream-based threat monitor system},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1644--1651},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454241},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Kharlamov:2008:III, author = "Evgeny Kharlamov and Werner Nutt", title = "Incompleteness in information integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1652--1658", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454242", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Deutch:2008:QWB,
  author          = {Daniel Deutch and Tova Milo},
  title           = {Querying {Web}-based applications under models of uncertainty},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1659--1665},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454244},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Silvasti:2008:XDF,
  author          = {Panu Silvasti and Seppo Sippu and Eljas Soisalon-Soininen},
  title           = {{XML}-document-filtering automaton},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {1},
  number          = {2},
  pages           = {1666--1671},
  month           = aug,
  year            = {2008},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1454159.1454245},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:44 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Scholl:2008:CDD, author = "Tobias Scholl and Alfons Kemper", title = "Community-driven data grids", journal = j-PROC-VLDB-ENDOWMENT, volume = "1", number = "2", pages = "1672--1677", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1454159.1454246", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:44 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Gatterbauer:2009:BIA,
  author          = {Wolfgang Gatterbauer and Magdalena Balazinska and Nodira Khoussainova and Dan Suciu},
  title           = {Believe it or not: adding belief annotations to databases},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {1--12},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Zhang:2009:SSB,
  author          = {Zhenjie Zhang and Beng Chin Ooi and Srinivasan Parthasarathy and Anthony K. H. Tung},
  title           = {Similarity search on {Bregman} divergence: towards non-metric indexing},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {13--24},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Zeng:2009:CSA, author = "Zhiping Zeng and Anthony K. H. Tung and Jianyong Wang and Jianhua Feng and Lizhu Zhou", title = "Comparing stars: on approximating graph edit distance", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "25--36", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Whang:2009:IBE,
  author          = {Steven Euijong Whang and Hector Garcia-Molina and Chad Brower and Jayavel Shanmugasundaram and Sergei Vassilvitskii and Erik Vee and Ramana Yerneni},
  title           = {Indexing {Boolean} expressions},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {37--48},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Zhou:2009:SDS,
  author          = {Yongluan Zhou and Ali Salehi and Karl Aberer},
  title           = {Scalable delivery of stream query result},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {49--60},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Benedikt:2009:SBI, author = "Michael Benedikt and James Cheney", title = "Schema-based independence analysis for {XML} updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "61--72", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Nehme:2009:TSD,
  author          = {Rimma V. Nehme and Elke A. Rundensteiner and Elisa Bertino},
  title           = {Tagging stream data for rich real-time services},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {73--84},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

%%% Author note: the first author's family name is the compound
%%% "Das Sarma".  It is braced so that BibTeX keeps both words in the
%%% Last part of the name instead of treating "Das" as a given name.
%%% The citation label is deliberately left unchanged so that existing
%%% citations of this entry continue to resolve.
@article{Sarma:2009:RMP,
  author          = {Atish {Das Sarma} and Ashwin Lall and Danupon Nanongkai and Jun Xu},
  title           = {Randomized multi-pass streaming skyline algorithms},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {85--96},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Reeves:2009:MMT, author = "Galen Reeves and Jie Liu and Suman Nath and Feng Zhao", title = "Managing massive time series streams with multi-scale compressed trickles", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "97--108", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Wu:2009:PAM,
  author          = {Tianyi Wu and Dong Xin and Qiaozhu Mei and Jiawei Han},
  title           = {Promotion analysis in multi-dimensional space},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {109--120},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Sarkas:2009:MDK,
  author          = {Nikos Sarkas and Nilesh Bansal and Gautam Das and Nick Koudas},
  title           = {Measure-driven keyword-query expansion},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {121--132},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Liu:2009:UTD, author = "Bin Liu and H. V. Jagadish", title = "Using trees to depict a forest", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "133--144", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Elmeleegy:2009:OPW,
  author          = {Hazem Elmeleegy and Ahmed K. Elmagarmid and Emmanuel Cecchet and Walid G. Aref and Willy Zwaenepoel},
  title           = {Online piece-wise linear approximation of numerical streams with precision guarantees},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {145--156},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Stern:2009:WTE,
  author          = {Mirco Stern and Erik Buchmann and Klemens B{\"o}hm},
  title           = {A wavelet transform for efficient consolidation of sensor relations with quality guarantees},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {157--168},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Yu:2009:EAQ, author = "Liu Yu and Jianzhong Li and Hong Gao and Xiaolin Fang", title = "Enabling $ \epsilon $-approximate querying in sensor networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "169--180", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

@article{Nandi:2009:HUS,
  author          = {Arnab Nandi and Philip A. Bernstein},
  title           = {{HAMSTER}: using search clicklogs for schema and taxonomy matching},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {181--192},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@article{Kot:2009:CUE,
  author          = {Lucja Kot and Christoph Koch},
  title           = {Cooperative update exchange in the {Youtopia} system},
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = {2},
  number          = {1},
  pages           = {193--204},
  month           = aug,
  year            = {2009},
  coden           = {????},
  issn            = {2150-8097},
  issn-l          = {2150-8097},
  bibdate         = {Fri May 13 14:54:50 MDT 2011},
  bibsource       = {http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {Proc. VLDB Endowment},
  fjournal        = {Proceedings of the VLDB Endowment},
  journal-url     = {http://portal.acm.org/citation.cfm?id=J1174},
}

@Article{Papapetrou:2009:RBA, author = "Panagiotis Papapetrou and Vassilis Athitsos and George Kollios and Dimitrios Gunopulos", title = "Reference-based alignment in large sequence databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "2", number = "1", pages = "205--216", month = aug, year = "2009", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:54:50 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Das:2009:TCM,
  author =       "Sudipto Das and Shyam Antony and Divyakant Agrawal and Amr {El Abbadi}",
  title =        "Thread cooperation in multicore architectures for frequency counting over multiple data streams",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "217--228",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mueller:2009:SWQ,
  author =       "Rene Mueller and Jens Teubner and Gustavo Alonso",
  title =        "Streams on wires: a query compiler for {FPGAs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "229--240",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chandramouli:2009:FPD,
  author =       "Badrish Chandramouli and Jonathan Goldstein and David Maier",
  title =        "On-the-fly progress detection in iterative stream queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "241--252",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kraska:2009:CRC,
  author =       "Tim Kraska and Martin Hentschel and Gustavo Alonso and Donald Kossmann",
  title =        "Consistency rationing in the cloud: pay only when it matters",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "253--264",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lomet:2009:LKR,
  author =       "David Lomet and Mohamed F. Mokbel",
  title =        "Locking key ranges with unbundled transaction services",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "265--276",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Candea:2009:SPJ,
  author =       "George Candea and Neoklis Polyzotis and Radek Vingralek",
  title =        "A scalable, predictable join operator for highly concurrent data warehouses",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "277--288",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gupta:2009:ATA,
  author =       "Rahul Gupta and Sunita Sarawagi",
  title =        "Answering table augmentation queries from unstructured lists on the {Web}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "289--300",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cautis:2009:ERX,
  author =       "Bogdan Cautis and Alin Deutsch and Nicola Onose and Vasilis Vassalos",
  title =        "Efficient rewriting of {XPath} queries using {Query Set Specifications}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "301--312",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2009:SSR,
  author =       "Ziyang Liu and Peng Sun and Yi Chen",
  title =        "Structured search result differentiation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "313--324",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dragut:2009:HAM,
  author =       "Eduard C. Dragut and Thomas Kabisch and Clement Yu and Ulf Leser",
  title =        "A hierarchical approach to model {Web} query interfaces for {Web} source integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "325--336",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cong:2009:ERT,
  author =       "Gao Cong and Christian S. Jensen and Dingming Wu",
  title =        "Efficient retrieval of the top-$k$ most relevant spatial {Web} objects",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "337--348",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dragut:2009:SWR,
  author =       "Eduard Dragut and Fang Fang and Prasad Sistla and Clement Yu and Weiyi Meng",
  title =        "Stop word and related problems in {Web} interface integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "349--360",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The whole proper name "Lazy-Adaptive Tree" is brace-protected below, so
%%% that sentence-casing styles do not downcase "Adaptive" while keeping "Tree".
@Article{Agrawal:2009:LAT,
  author =       "Devesh Agrawal and Deepak Ganesan and Ramesh Sitaraman and Yanlei Diao and Shashi Singh",
  title =        "{Lazy-Adaptive Tree}: an optimized index structure for flash devices",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "361--372",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lee:2009:MDM,
  author =       "Rubao Lee and Xiaoning Ding and Feng Chen and Qingda Lu and Xiaodong Zhang",
  title =        "{MCC-DB}: minimizing cache conflicts in multi-core processors for databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "373--384",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Willhalm:2009:SSU,
  author =       "Thomas Willhalm and Nicolae Popovici and Yazan Boshmaf and Hasso Plattner and Alexander Zeier and Jan Schaffner",
  title =        "{SIMD-scan}: ultra fast in-memory table scan using on-chip vector processing units",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "385--394",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chaudhuri:2009:MDC,
  author =       "Surajit Chaudhuri and Venkatesh Ganti and Dong Xin",
  title =        "Mining document collections to facilitate accurate approximate entity matching",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "395--406",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2009:RAR,
  author =       "Wenfei Fan and Xibei Jia and Jianzhong Li and Shuai Ma",
  title =        "Reasoning about record matching rules",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "407--418",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dobra:2009:TCE,
  author =       "Alin Dobra and Chris Jermaine and Florin Rusu and Fei Xu",
  title =        "Turbo-charging estimate convergence in {DBO}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "419--430",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cohen:2009:CSA,
  author =       "Edith Cohen and Nick Duffield and Haim Kaplan and Carsten Lund and Mikkel Thorup",
  title =        "Composable, scalable, and accurate weight summarization of unaggregated data sets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "431--442",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wu:2009:DOA,
  author =       "Sai Wu and Shouxu Jiang and Beng Chin Ooi and Kian-Lee Tan",
  title =        "Distributed online aggregations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "443--454",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Koloniari:2009:RBC,
  author =       "Georgia Koloniari and Evaggelia Pitoura",
  title =        "A recall-based cluster formation game in peer-to-peer systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "455--466",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fekete:2009:QIA,
  author =       "Alan Fekete and Shirley N. Goldrei and Jorge P{\'e}rez Asenjo",
  title =        "Quantifying isolation anomalies",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "467--478",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Johnson:2009:IOS,
  author =       "Ryan Johnson and Ippokratis Pandis and Anastasia Ailamaki",
  title =        "Improving {OLTP} scalability using speculative lock inheritance",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "479--489",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sears:2009:SBR,
  author =       "Russell Sears and Eric Brewer",
  title =        "Segment-based recovery: write-ahead logging revisited",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "490--501",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2009:UAR,
  author =       "Jian Li and Barna Saha and Amol Deshpande",
  title =        "A unified approach to ranking in probabilistic databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "502--513",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Arasu:2009:LST,
  author =       "Arvind Arasu and Surajit Chaudhuri and Raghav Kaushik",
  title =        "Learning string transformations from examples",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "514--525",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cormode:2009:PHP,
  author =       "Graham Cormode and Antonios Deligiannakis and Minos Garofalakis and Andrew McGregor",
  title =        "Probabilistic histograms for probabilistic data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "526--537",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Greenshpan:2009:AM,
  author =       "Ohad Greenshpan and Tova Milo and Neoklis Polyzotis",
  title =        "Autocompletion for mashups",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "538--549",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dong:2009:ICD,
  author =       "Xin Luna Dong and Laure Berti-Equille and Divesh Srivastava",
  title =        "Integrating conflicting data: the role of source dependence",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "550--561",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dong:2009:TDC,
  author =       "Xin Luna Dong and Laure Berti-Equille and Divesh Srivastava",
  title =        "Truth discovery and copying detection in a dynamic world",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "562--573",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Golab:2009:SD,
  author =       "Lukasz Golab and Howard Karloff and Flip Korn and Avishek Saha and Divesh Srivastava",
  title =        "Sequential dependencies",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "574--585",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Denev:2009:SFQ,
  author =       "Dimitar Denev and Arturas Mazeika and Marc Spaniol and Gerhard Weikum",
  title =        "{SHARC}: framework for quality-conscious {Web} archiving",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "586--597",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Beskales:2009:MQP,
  author =       "George Beskales and Mohamed A. Soliman and Ihab F. Ilyas and Shai Ben-David",
  title =        "Modeling and querying possible repairs in duplicate detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "598--609",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mindolin:2009:DRI,
  author =       "Denis Mindolin and Jan Chomicki",
  title =        "Discovering relative importance of skyline attributes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "610--621",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kim:2009:PDB,
  author =       "Min-Soo Kim and Jiawei Han",
  title =        "A particle-and-density based evolutionary clustering method for dynamic networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "622--633",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yang:2009:SRD,
  author =       "Xiaoyan Yang and Cecilia M. Procopiuc and Divesh Srivastava",
  title =        "Summarizing relational databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "634--645",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cohen:2009:CWS,
  author =       "Edith Cohen and Haim Kaplan and Subhabrata Sen",
  title =        "Coordinated weighted sampling for estimating aggregates over multiple weight assignments",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "646--657",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lee:2009:PLB,
  author =       "Hongrae Lee and Raymond T. Ng and Kyuseok Shim",
  title =        "Power-law based estimation of set similarity join size",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "658--669",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Karras:2009:OSL,
  author =       "Panagiotis Karras",
  title =        "Optimality and scalability in lattice histogram construction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "670--681",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Vigfusson:2009:APD,
  author =       "Ymir Vigfusson and Adam Silberstein and Brian F. Cooper and Rodrigo Fonseca",
  title =        "Adaptively parallelizing distributed range queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "682--693",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tatikonda:2009:MTS,
  author =       "Shirish Tatikonda and Srinivasan Parthasarathy",
  title =        "Mining tree-structured data on multicore systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "694--705",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Author given names are stored in full (the source record carried initials
%%% only), matching how Gustavo Alonso and Donald Kossmann appear in the other
%%% entries of this bibliography; styles can abbreviate, but cannot expand.
@Article{Unterbrunner:2009:PPU,
  author =       "Philipp Unterbrunner and Georgios Giannikis and Gustavo Alonso and Dietmar Fauser and Donald Kossmann",
  title =        "Predictable performance for unpredictable workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "706--717",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhou:2009:GCB,
  author =       "Yang Zhou and Hong Cheng and Jeffrey Xu Yu",
  title =        "Graph clustering based on structural\slash attribute similarities",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "718--729",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{AlHasan:2009:OSS,
  author =       "Mohammad {Al Hasan} and Mohammed J. Zaki",
  title =        "Output space sampling for graph patterns",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "730--741",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2009:MGP,
  author =       "Chen Chen and Cindy X. Lin and Matt Fredrikson and Mihai Christodorescu and Xifeng Yan and Jiawei Han",
  title =        "Mining graph patterns efficiently via randomized summaries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "742--753",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Third author corrected from "Ashish Chawlat" to "Ashish Chawla"; the
%%% trailing "t" in the source record is presumably a footnote mark that was
%%% fused into the surname (verify against the paper's title page).
@Article{Amer-Yahia:2009:GRS,
  author =       "Sihem Amer-Yahia and Senjuti Basu Roy and Ashish Chawla and Gautam Das and Cong Yu",
  title =        "Group recommendation: semantics and efficiency",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "754--765",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bhagat:2009:CBG,
  author =       "Smriti Bhagat and Graham Cormode and Balachander Krishnamurthy and Divesh Srivastava",
  title =        "Class-based graph anonymization for social network data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "766--777",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sarkas:2009:ISS,
  author =       "Nikos Sarkas and Gautam Das and Nick Koudas",
  title =        "Improved search for socially annotated data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "778--789",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Machanavajjhala:2009:DPA,
  author =       "Ashwin Machanavajjhala and Johannes Gehrke and Michaela G{\"o}tz",
  title =        "Data publishing against realistic adversaries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "790--801",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pang:2009:SVO,
  author =       "HweeHwa Pang and Jilian Zhang and Kyriakos Mouratidis",
  title =        "Scalable verification for outsourced dynamic databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "802--813",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Xiao:2009:ORP,
  author =       "Xiaokui Xiao and Yufei Tao and Minghua Chen",
  title =        "Optimal random perturbation at multiple privacy levels",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "814--825",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Assent:2009:ADE,
  author =       "Ira Assent and Marc Wichterich and Ralph Krieger and Hardy Kremer and Thomas Seidl",
  title =        "Anticipatory {DTW} for efficient similarity search in time series databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "826--837",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tsirogiannis:2009:IPL,
  author =       "Dimitris Tsirogiannis and Sudipto Guha and Nick Koudas",
  title =        "Improving the performance of list intersection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "838--849",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 850, 862, and 874.

@Article{Kaushik:2009:CHP,
  author =       "Raghav Kaushik and Dan Suciu",
  title =        "Consistent histograms in the presence of distinct value counts",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "850--861",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Aggarwal:2009:GCI,
  author =       "Charu Aggarwal and Yan Xie and Philip S. Yu",
  title =        "{GConnect}: a connectivity index for massive disk-resident graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "862--873",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yang:2009:SES,
  author =       "Di Yang and Elke A. Rundensteiner and Matthew O. Ward",
  title =        "A shared execution strategy for multiple pattern mining requests over streaming data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "874--885",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 886, 898, and 910.

@Article{Zou:2009:DJP,
  author =       "Lei Zou and Lei Chen and M. Tamer {\"O}zsu",
  title =        "{Distance-join}: pattern match query in a large graph database",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "886--897",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wan:2009:CCP,
  author =       "Qian Wan and Raymond Chi-Wing Wong and Ihab F. Ilyas and M. Tamer {\"O}zsu and Yu Peng",
  title =        "Creating competitive products",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "898--909",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mueller:2009:DPF,
  author =       "Rene Mueller and Jens Teubner and Gustavo Alonso",
  title =        "Data processing on {FPGAs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "910--921",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 922, 934, and 946.

@Article{Abouzeid:2009:HAH,
  author =       "Azza Abouzeid and Kamil Bajda-Pawlikowski and Daniel Abadi and Avi Silberschatz and Alexander Rasin",
  title =        "{HadoopDB}: an architectural hybrid of {MapReduce} and {DBMS} technologies for analytical workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "922--933",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{He:2009:ASV,
  author =       "Yeye He and Jeffrey F. Naughton",
  title =        "Anonymization of set-valued data via top-down, local generalization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "934--945",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The math-mode variable that opens the next title is brace-protected
%%% so that a bibliography style that changes title case cannot turn
%%% the lowercase variable k into a different symbol.

@Article{Zou:2009:AGF,
  author =       "Lei Zou and Lei Chen and M. Tamer {\"O}zsu",
  title =        "{$k$}-automorphism: a general framework for privacy preserving network publication",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "946--957",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 958, 970, and 982.

@Article{Koudas:2009:DBM,
  author =       "Nick Koudas and Divesh Srivastava and Ting Yu and Qing Zhang",
  title =        "Distribution based microdata anonymization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "958--969",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Meier:2009:CTB,
  author =       "Michael Meier and Michael Schmidt and Georg Lausen",
  title =        "On chase termination beyond stratification",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "970--981",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Moerkotte:2009:PBP,
  author =       "Guido Moerkotte and Thomas Neumann and Gabriele Steidl",
  title =        "Preventing bad plans by bounding the impact of cardinality estimation errors",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "982--993",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 994, 1006, and 1018.

@Article{Chaudhuri:2009:ECQ,
  author =       "Surajit Chaudhuri and Vivek Narasayya and Ravi Ramamurthy",
  title =        "Exact cardinality query optimization for optimizer testing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "994--1005",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{tenCate:2009:LSM,
  author =       "Balder ten Cate and Laura Chiticariu and Phokion Kolaitis and Wang-Chiew Tan",
  title =        "Laconic schema mappings: computing the core with {SQL} queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1006--1017",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Arenas:2009:ISM,
  author =       "Marcelo Arenas and Jorge P{\'e}rez and Juan Reutter and Cristian Riveros",
  title =        "Inverting schema mappings: bridging the gap between theory and practice",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1018--1029",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1030, 1042, and 1054.

@Article{Terwilliger:2009:FFF,
  author =       "James F. Terwilliger and Philip A. Bernstein and Sergey Melnik",
  title =        "Full-fidelity flexible object-oriented {XML} access",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1030--1041",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2009:PAM,
  author =       "Ting Wang and Ling Liu",
  title =        "Privacy-aware mobile services over road networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1042--1053",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The first author's family name is the single letter U (compare the
%%% citation label of the next entry); it is a complete name, not an
%%% abbreviated initial, so it is written without a trailing period.

@Article{U:2009:FAA,
  author =       "Leong Hou U and Nikos Mamoulis and Kyriakos Mouratidis",
  title =        "A fair assignment algorithm for multiple preference queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1054--1065",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1066, 1078, and 1090.

@Article{Mishima:2009:PED,
  author =       "Takeshi Mishima and Hiroshi Nakamura",
  title =        "{Pangea}: an eager database replication middleware guaranteeing snapshot isolation without modification of database servers",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1066--1077",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Elmeleegy:2009:HRT,
  author =       "Hazem Elmeleegy and Jayant Madhavan and Alon Halevy",
  title =        "Harvesting relational tables from lists on the web",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1078--1089",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cafarella:2009:DIR,
  author =       "Michael J. Cafarella and Alon Halevy and Nodira Khoussainova",
  title =        "Data integration for the relational web",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1090--1101",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1102, 1114, and 1126.

@Article{Gottlob:2009:NOS,
  author =       "Georg Gottlob and Reinhard Pichler and Vadim Savenkov",
  title =        "Normalization and optimization of schema mappings",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1102--1113",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Xing:2009:CMN,
  author =       "Songhua Xing and Cyrus Shahabi and Bei Pan",
  title =        "Continuous monitoring of nearest neighbors on land surface",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1114--1125",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wong:2009:EMM,
  author =       "Raymond Chi-Wing Wong and M. Tamer {\"O}zsu and Philip S. Yu and Ada Wai-Chee Fu and Lian Liu",
  title =        "Efficient method for maximizing bichromatic reverse nearest neighbor",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1126--1137",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1138 and 1150, followed by the opening of the next entry.

@Article{Cheema:2009:LUE,
  author =       "Muhammad Aamir Cheema and Xuemin Lin and Ying Zhang and Wei Wang and Wenjie Zhang",
  title =        "Lazy updates: an efficient technique to continuously monitoring reverse {kNN}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1138--1149",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2009:NMM,
  author =       "Ling Chen and Sourav S. Bhowmick and Wolfgang Nejdl",
  title =        "{NEAR-Miner}: mining evolution associations of {Web} site directories for efficient maintenance of {Web} archives",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1150--1161",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wong:2009:AEO,
  author =       "W. K. Wong and David W.
Cheung and Edward Hung and Ben Kao and Nikos Mamoulis",
  title =        "An audit environment for outsourcing of frequent itemset mining",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1162--1173",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1174 and 1186, followed by the opening of the next entry.

@Article{Mozafari:2009:PNB,
  author =       "Barzan Mozafari and Carlo Zaniolo",
  title =        "Publishing naive {Bayesian} classifiers: privacy without accuracy loss",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1174--1185",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tzoumas:2009:WAI,
  author =       "Kostas Tzoumas and Man Lung Yiu and Christian S. Jensen",
  title =        "Workload-aware indexing of continuously moving objects",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1186--1197",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2009:EIU,
  author =       "Meihui Zhang and Su Chen and Christian S.
Jensen and Beng Chin Ooi and Zhenjie Zhang",
  title =        "Effectively indexing uncertain moving objects for predictive queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1198--1209",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1210 and 1222.

@Article{Sankaranarayanan:2009:POS,
  author =       "Jagan Sankaranarayanan and Hanan Samet and Houman Alborzi",
  title =        "Path oracles for spatial networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1210--1221",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kimura:2009:CMC,
  author =       "Hideaki Kimura and George Huo and Alexander Rasin and Samuel Madden and Stanley B. Zdonik",
  title =        "Correlation maps: a compressed access method for exploiting soft functional dependencies",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1222--1233",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1234 and 1246.

@Article{Schnaitter:2009:IIP,
  author =       "Karl Schnaitter and Neoklis Polyzotis and Lise Getoor",
  title =        "Index interactions in physical design tuning: modeling, analysis, and applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1234--1245",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Duan:2009:TDC,
  author =       "Songyun Duan and Vamsidhar Thummala and Shivnath Babu",
  title =        "Tuning database configuration parameters with {iTuned}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1246--1257",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(1), August 2009: entries starting on pages
%%% 1258 and 1270, followed by the opening of the next entry.

@Article{Salles:2009:ECR,
  author =       "Marcos Vaz Salles and Tuan Cao and Benjamin Sowell and Alan Demers and Johannes Gehrke and Christoph Koch and Walker White",
  title =        "An evaluation of checkpoint recovery for massively multiplayer online games",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1258--1269",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Muller:2009:ECS,
  author =       "Emmanuel M{\"u}ller and Stephan G{\"u}nnemann and Ira Assent and Thomas Seidl",
  title =        "Evaluating clustering in subspace projections of high dimensional data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1270--1281",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hassanzadeh:2009:FEC,
  author =       "Oktie Hassanzadeh and Fei Chiang and Hyun Chul Lee and Ren{\'e}e J.
Miller",
  title =        "Framework for evaluating clustering algorithms in duplicate detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "1",
  pages =        "1282--1293",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:50 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(2), August 2009, begins with the next entry.
%%% ``SQL Server'' is part of the product name, so the complete name is
%%% brace-protected to keep its capitals when a style downcases titles.

@Article{Guo:2009:DMM,
  author =       "Hongfei Guo and Dan Jones and Jennifer Beckmann and Praveen Seshadri",
  title =        "Declarative management in {Microsoft SQL Server}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1294--1305",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{El-Helw:2009:SRS,
  author =       "Amr El-Helw and Ihab F. Ilyas and Calisto Zuzarte",
  title =        "{StatAdvisor}: recommending statistical views",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1306--1317",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Canim:2009:OPA,
  author =       "Mustafa Canim and George A. Mihaila and Bishwaranjan Bhattacharjee and Kenneth A. Ross and Christian A.
Lang",
  title =        "An object placement advisor for {DB2} using solid state storage",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1318--1329",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(2), August 2009: entries starting on pages
%%% 1330 and 1342.

@Article{Bhide:2009:XXP,
  author =       "Manish Bhide and Manoj K. Agarwal and Amir Bar-Or and Sriram Padmanabhan and Srinivas K. Mittapalli and Girish Venkatachaliah",
  title =        "{XPEDIA}: {XML} processing for data integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1330--1341",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bamford:2009:XR,
  author =       "Roger Bamford and Vinayak Borkar and Matthias Brantner and Peter M. Fischer and Daniela Florescu and David Graf and Donald Kossmann and Tim Kraska and Dan Muresan and Sorin Nasoi and Markos Zacharioudakis",
  title =        "{XQuery} reloaded",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1342--1353",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(2), August 2009: entries starting on pages
%%% 1354 and 1366, followed by the opening of the next entry.

@Article{Zhang:2009:BXS,
  author =       "Ning Zhang and Nipun Agarwal and Sivasankaran Chandrasekar and Sam Idicula and Vijay Medi and Sabina Petride and Balasubramanyam Sthanikam",
  title =        "Binary {XML} storage and query processing in {Oracle 11g}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1354--1365",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bellamkonda:2009:ESO,
  author =       "Srikanth Bellamkonda and Rafi Ahmed and Andrew Witkowski and Angela Amor and Mohamed Zait and Chun-Chieh Lin",
  title =        "Enhanced subquery optimizations in {Oracle}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1366--1377",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kim:2009:SVH,
  author =       "Changkyu Kim and Tim Kaldewey and Victor W. Lee and Eric Sedlar and Anthony D. Nguyen and Nadathur Satish and Jatin Chhugani and Andrea {Di Blas} and Pradeep Dubey",
  title =        "Sort vs.
{Hash} revisited: fast join implementation on modern multi-core {CPUs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1378--1389",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(2), August 2009: entries starting on pages
%%% 1390 and 1402, followed by the opening of the next entry.

@Article{Xu:2009:EOJ,
  author =       "Yu Xu and Pekka Kostamaa",
  title =        "Efficient outer join data skew handling in parallel {DBMS}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1390--1396",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Page gap v2n2 pp 1397--1401 at journal Web site

@Article{Friedman:2009:SMP,
  author =       "Eric Friedman and Peter Pawlowski and John Cieslewicz",
  title =        "{SQL\slash MapReduce}: a practical approach to self-describing, polymorphic, and parallelizable user-defined functions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1402--1413",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gates:2009:BHL,
  author =       "Alan F.
Gates and Olga Natkovich and Shubham Chopra and Pradeep Kamath and Shravan M. Narayanamurthy and Christopher Olston and Benjamin Reed and Santhosh Srinivasan and Utkarsh Srivastava",
  title =        "Building a high-level dataflow system on top of {Map-Reduce}: the {Pig} experience",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1414--1425",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Proc. VLDB Endowment 2(2), August 2009: entries starting on pages
%%% 1426 and 1438.

@Article{Panda:2009:PMP,
  author =       "Biswanath Panda and Joshua S. Herbach and Sugato Basu and Roberto J. Bayardo",
  title =        "{PLANET}: massively parallel learning of tree ensembles with {MapReduce}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1426--1437",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Legler:2009:RDT,
  author =       "Thomas Legler and Wolfgang Lehner and Jan Schaffner and Jens Kr{\"u}ger",
  title =        "Robust and distributed top-n frequent-pattern mining with {SAP BW} accelerator",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "2",
  number =       "2",
  pages =        "1438--1449",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:54:57 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dieu:2009:TUF,
  author          = "Nicolas Dieu and Adrian Dragusanu and Fran{\c{c}}oise Fabret and Fran{\c{c}}ois Llirbat and Eric Simon",
  title           = "1,000 tables under the form",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1450--1461",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bhattacharjee:2009:EIC,
  author          = "Bishwaranjan Bhattacharjee and Lipyeow Lim and Timothy Malkemus and George Mihaila and Kenneth Ross and Sherman Lau and Cathy McArthur and Zoltan Toth and Reza Sherkat",
  title           = "Efficient index compression in {DB2 LUW}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1462--1473",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lacroix:2009:SSW,
  author          = "Zo{\'e} Lacroix and Christophe Legendre and Spyro Mousses",
  title           = "Storing scientific workflows in a database",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1474--1480",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cohen:2009:MSN,
  author          = "Jeffrey Cohen and Brian Dolan and Mark Dunlap and Joseph M. Hellerstein and Caleb Welton",
  title           = "{MAD} skills: new analysis practices for big data",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1481--1492",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ley:2009:DSL,
  author          = "Michael Ley",
  title           = "{DBLP}: some lessons learned",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1493--1500",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mukherjee:2009:OSP,
  author          = "Niloy Mukherjee and Amit Ganesh and Vinayagam Djegaradjane and Sujatha Muthulingam and Wei Zhang and Krishna Kunchithapadam and Scott Lynn and Bharath Aleti and Kam Shergill and Shaoyu Wang",
  title           = "{Oracle SecureFiles}: prepared for the digital deluge",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1501--1511",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Baumgartner:2009:SWD,
  author          = "Robert Baumgartner and Georg Gottlob and Marcus Herzog",
  title           = "Scalable {Web} data extraction for online market intelligence",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1512--1523",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Rajaraman:2009:KHP,
  author          = "Anand Rajaraman",
  title           = "{Kosmix}: high-performance topic exploration using the deep {Web}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1524--1529",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Nehme:2009:QMM,
  author          = "Rimma V. Nehme and Karen E. Works and Elke A. Rundensteiner and Elisa Bertino",
  title           = "Query mesh: multi-route query processing technology",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1530--1533",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The first author's surname is written with its acute accent in the author
%%% field; the citation key stays unaccented so existing citations still resolve.

@Article{Cudre-Mauroux:2009:DSS,
  author          = "P. Cudr{\'e}-Mauroux and H. Kimura and K.-T. Lim and J. Rogers and R. Simakov and E. Soroush and P. Velikhov and D. L. Wang and M. Balazinska and J. Becla and D. DeWitt and B. Heath and D. Maier and S. Madden and J. Patel and M. Stonebraker and S.
Zdonik",
  title           = "A demonstration of {SciDB}: a science-oriented {DBMS}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1534--1537",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2009:MMM,
  author          = "Kuien Liu and Ke Deng and Zhiming Ding and Mingshu Li and Xiaofang Zhou",
  title           = "{MOIR\slash MT}: monitoring large-scale road network traffic in real-time",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1538--1541",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Colle:2009:ODR,
  author          = "Romain Colle and Leonidas Galanis and Supiti Buranawatanachoke and Stratos Papadomanolakis and Yujun Wang",
  title           = "{Oracle Database Replay}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1542--1545",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Borisov:2009:DPD,
  author          = "Nedyalko Borisov and Shivnath Babu and Sandeep Uttamchandani and Ramani Routray and Aameek Singh",
  title           = "{DIADS}: a problem diagnosis tool for databases and storage area networks",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1546--1549",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Herschel:2009:ASA,
  author          = "Melanie Herschel and Mauricio A. Hern{\'a}ndez and Wang-Chiew Tan",
  title           = "{Artemis}: a system for analyzing missing answers",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1550--1553",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's surname is written with its acute accent
%%% (Cudr{\'e}-Mauroux) in the author field below.

@Article{Wu:2009:DTS,
  author          = "Eugene Wu and Philippe Cudr{\'e}-Mauroux and Samuel Madden",
  title           = "Demonstration of the {TrajStore} system",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1554--1557",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ali:2009:MCS,
  author          = "M. H. Ali and C. Gerea and B. S. Raman and B. Sezgin and T. Tarnavski and T. Verona and P. Wang and P. Zabback and A. Ananthanarayan and A. Kirilov and M. Lu and A. Raizman and R. Krishnan and R. Schindlauer and T. Grabs and S. Bjeletich and B. Chandramouli and J. Goldstein and S. Bhat and Ying Li and V. {Di Nicola} and X. Wang and David Maier and S. Grell and O. Nano and I. Santos",
  title           = "{Microsoft CEP Server} and online behavioral targeting",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1558--1561",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Krompass:2009:TMD,
  author          = "Stefan Krompass and Harumi Kuno and Janet L. Wiener and Kevin Wilkinson and Umeshwar Dayal and Alfons Kemper",
  title           = "A testbed for managing dynamic mixed workloads",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1562--1565",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ahmad:2009:DSC,
  author          = "Yanif Ahmad and Christoph Koch",
  title           = "{DBToaster}: a {SQL} compiler for high-performance delta processing in main-memory databases",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1566--1569",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Preda:2009:AAK,
  author          = "Nicoleta Preda and Fabian M. Suchanek and Gjergji Kasneci and Thomas Neumann and Maya Ramanath and Gerhard Weikum",
  title           = "{ANGIE}: active knowledge for interactive exploration",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1570--1573",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kopcke:2009:CEE,
  author          = "Hanna K{\"o}pcke and Andreas Thor and Erhard Rahm",
  title           = "Comparative evaluation of entity resolution approaches with {FEVER}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1574--1577",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Brauer:2009:RDR,
  author          = "Falk Brauer and Wojciech Barczynski and Gregor Hackenbroich and Marcus Schramm and Adrian Mocan and Felix F{\"o}rster",
  title           = "{RankIE}: document retrieval on ranked entity graphs",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1578--1581",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The tool name +Spicy is brace-protected in the title below so that
%%% bibliography styles which downcase titles keep its capital S.

@Article{Mecca:2009:CEM,
  author          = "Giansalvatore Mecca and Paolo Papotti and Salvatore Raunich and Marcello Buoncristiano",
  title           = "Concise and expressive mappings with {+Spicy}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1582--1585",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cruz:2009:AEM,
  author          = "Isabel F.
Cruz and Flavio Palandri Antonelli and Cosmin Stroe",
  title           = "{AgreementMaker}: efficient matching for large real-world schemas and ontologies",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1586--1589",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hassanzadeh:2009:LQW,
  author          = "Oktie Hassanzadeh and Reynold Xin and Ren{\'e}e J. Miller and Anastasios Kementsietsidis and Lipyeow Lim and Min Wang",
  title           = "{Linkage Query Writer}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1590--1593",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2009:SEE,
  author          = "Xiaoyuan Wang and Xingzhi Sun and Feng Cao and Li Ma and Nick Kanellos and Kang Zhang and Yue Pan and Yong Yu",
  title           = "{SMDM}: enhancing enterprise-wide master data management using semantic {Web} technologies",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1594--1597",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gubanov:2009:IUR,
  author          = "Michael N. Gubanov and Lucian Popa and Howard Ho and Hamid Pirahesh and Jeng-Yih Chang and Shr-Chang Chen",
  title           = "{IBM UFO} repository: object-oriented data integration",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1598--1601",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2009:MSW,
  author          = "Huajun Chen and Bin Lu and Yuan Ni and Guotong Xie and Chunying Zhou and Jinhua Mi and Zhaohui Wu",
  title           = "Mashup by surfing a {Web} of data {APIs}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1602--1605",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pichler:2009:DDE,
  author          = "Reinhard Pichler and Vadim Savenkov",
  title           = "{DEMo}: data exchange modeling tool",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1606--1609",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Letchner:2009:LDW,
  author          = "Julie Letchner and Christopher R{\'e} and Magdalena Balazinska and Matthai Philipose",
  title           = "Lahar demonstration: warehousing {Markovian} streams",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1610--1613",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sun:2009:WAC,
  author          = "Peng Sun and Ziyang Liu and Sivaramakrishnan Natarajan and Susan B. Davidson and Yi Chen",
  title           = "{WOLVES}: achieving correct provenance analysis by detecting and resolving unsound workflow views",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1614--1617",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dai:2009:TTI,
  author          = "Chenyun Dai and Gabriel Ghinita and Elisa Bertino and Ji-Won Byun and Ninghui Li",
  title           = "{TIAMAT}: a tool for interactive analysis of microdata anonymization techniques",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1618--1621",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yu:2009:IIN,
  author          = "Yintao Yu and Cindy X. Lin and Yizhou Sun and Chen Chen and Jiawei Han and Binbin Liao and Tianyi Wu and ChengXiang Zhai and Duo Zhang and Bo Zhao",
  title           = "{iNextCube}: information network-enhanced text cube",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1622--1625",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Thusoo:2009:HWS,
  author          = "Ashish Thusoo and Joydeep Sen Sarma and Namit Jain and Zheng Shao and Prasad Chakka and Suresh Anthony and Hao Liu and Pete Wyckoff and Raghotham Murthy",
  title           = "{Hive}: a warehousing solution over a map-reduce framework",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1626--1629",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Satish:2009:TEB,
  author          = "Arjun Satish and Ramesh Jain and Amarnath Gupta",
  title           = "{Tolkien}: an event based storytelling system",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1630--1633",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sarigol:2009:ESN,
  author          = "Emre Sarig{\"o}l and Oriana Riva and Patrick Stuedi and Gustavo Alonso",
  title           = "Enabling social networking in ad hoc networks of mobile phones",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1634--1637",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bao:2009:PVD,
  author          = "Zhuowei Bao and Sarah Cohen-Boulakia and Susan B. Davidson and Pierrick Girard",
  title           = "{PDiffView}: viewing the difference in provenance of workflow results",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1638--1641",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2009:GOW,
  author          = "Daniel Deutch and Tova Milo and Tom Yam",
  title           = "Goal-oriented {Web}-site navigation for on-line shoppers",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1642--1645",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pereira:2009:AWQ,
  author          = "Fernando Pereira and Anand Rajaraman and Sunita Sarawagi and William Tunstall-Pedoe and Gerhard Weikum and Alon Halevy",
  title           = "Answering {Web} questions using structured data: dream or reality?",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1646--1646",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bernstein:2009:HBB,
  author          = "Philip A. Bernstein and Daniel J. Abadi and Michael J. Cafarella and Joseph M. Hellerstein and Donald Kossmann and Samuel Madden",
  title           = "How best to build {Web}-scale data managers?",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1647--1647",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Manegold:2009:DAE,
  author          = "Stefan Manegold and Martin L.
Kersten and Peter Boncz",
  title           = "Database architecture evolution: mammals flourished long before dinosaurs became extinct",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1648--1653",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dong:2009:DFR,
  author          = "Xin Luna Dong and Felix Naumann",
  title           = "Data fusion: resolving data conflicts for integration",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1654--1655",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Heer:2009:DVS,
  author          = "Jeffrey Heer and Joseph M. Hellerstein",
  title           = "Data visualization and social data analysis",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1656--1657",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chaudhuri:2009:KQR,
  author          = "Surajit Chaudhuri and Gautam Das",
  title           = "Keyword querying and ranking in databases",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1658--1659",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hadjieleftheriou:2009:EAS,
  author          = "Marios Hadjieleftheriou and Chen Li",
  title           = "Efficient approximate search on string collections",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1660--1661",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Srivastava:2009:ITD,
  author          = "Divesh Srivastava and Suresh Venkatasubramanian",
  title           = "Information theory for data management",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1662--1663",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Abadi:2009:COD,
  author          = "Daniel J. Abadi and Peter A. Boncz and Stavros Harizopoulos",
  title           = "Column-oriented database systems",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "2",
  number          = "2",
  pages           = "1664--1665",
  month           = aug,
  year            = "2009",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:54:57 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Srivastava:2010:ERT,
  author          = "Divesh Srivastava and Lukasz Golab and Rick Greer and Theodore Johnson and Joseph Seidel and Vladislav Shkapenyuk and Oliver Spatscheck and Jennifer Yates",
  title           = "Enabling real time data analysis",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "3",
  number          = "1--2",
  pages           = "1--2",
  month           = sep,
  year            = "2010",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:55:02 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Matsudaira:2010:HEB,
  author          = "Paul Matsudaira",
  title           = "High-end biological imaging generates very large {$3$D+} and dynamic datasets",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "3",
  number          = "1--2",
  pages           = "3--3",
  month           = sep,
  year            = "2010",
  CODEN           = "????",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Fri May 13 14:55:02 MDT 2011",
  bibsource       = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cho:2010:DWD,
  author = "Junghoo Cho and Hector Garcia-Molina",
  title = "Dealing with {Web} data: history and look ahead",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "4--4",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
  remark = "10-year best paper award",
}

@Article{Kemme:2010:DRT,
  author = "Bettina Kemme and Gustavo Alonso",
  title = "Database replication: a tale of research across communities",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "5--12",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
  remark = "10-year best paper award",
}

@Article{Canim:2010:BDR,
  author = "Mustafa Canim and Murat Kantarcio{\u{g}}lu and Bijit Hore and Sharad Mehrotra",
  title = "Building disclosure risk aware query optimizers for relational databases",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "13--24",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% In the next entry the compound family name "Le Folgoc" is braced so
%%% that BibTeX keeps it together as the surname; unbraced, the capitalised
%%% "Le" would be parsed as a second given name.

@Article{Allard:2010:SPD,
  author = "Tristan Allard and Nicolas Anciaux and Luc Bouganim and Yanli Guo and Lionel {Le Folgoc} and Benjamin Nguyen and Philippe Pucheral and Indrajit Ray and Indrakshi Ray and Shaoyi Yin",
  title = "Secure personal data servers: a vision paper",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "25--35",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fabbri:2010:PMR,
  author = "Daniel Fabbri and Kristen LeFevre and Qiang Zhu",
  title = "{PolicyReplay}: misconfiguration-response queries for data breach reporting",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "36--47",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Curino:2010:SWD,
  author = "Carlo Curino and Evan Jones and Yang Zhang and Sam Madden",
  title = "{Schism}: a workload-driven approach to database replication and partitioning",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "48--57",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Qin:2010:TTS,
  author = "Lu Qin and Jeffrey Xu Yu and Lijun Chang",
  title = "Ten thousand {SQLs}: parallel keyword queries computing",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "58--69",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Thomson:2010:CDD,
  author = "Alexander Thomson and Daniel J. Abadi",
  title = "The case for determinism in database systems",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "70--80",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Alexe:2010:MCI,
  author = "Bogdan Alexe and Mauricio Hern{\'a}ndez and Lucian Popa and Wang-Chiew Tan",
  title = "{MapMerge}: correlating independent schema mappings",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "81--92",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Greco:2010:CTC,
  author = "Sergio Greco and Francesca Spezzano",
  title = "Chase termination: a constraints rewriting approach",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "93--104",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Marnette:2010:SDE,
  author = "Bruno Marnette and Giansalvatore Mecca and Paolo Papotti",
  title = "Scalable data exchange with functional dependencies",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "105--116",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kanza:2010:IRS,
  author = "Yaron Kanza and Roy Levin and Eliyahu Safra and Yehoshua Sagiv",
  title = "Interactive route search in the presence of order constraints",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "117--128",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lang:2010:EMM,
  author = "Willis Lang and Jignesh M. Patel",
  title = "Energy management for {MapReduce} clusters",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "129--139",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Baid:2010:TSK,
  author = "Akanksha Baid and Ian Rae and Jiexing Li and AnHai Doan and Jeffrey Naughton",
  title = "Toward scalable keyword search over relational data",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "140--149",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mozafari:2010:REN,
  author = "Barzan Mozafari and Kai Zeng and Carlo Zaniolo",
  title = "From regular expressions to nested words: unifying languages and query execution for relational and {XML} sequences",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "150--161",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Grust:2010:ASL,
  author = "Torsten Grust and Jan Rittinger and Tom Schreiber",
  title = "Avalanche-safe {LINQ} compilation",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "162--172",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2010:TCF,
  author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Nan Tang and Wenyuan Yu",
  title = "Towards certain fixes with editing rules and master data",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "173--184",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Herschel:2010:EMA,
  author = "Melanie Herschel and Mauricio A. Hern{\'a}ndez",
  title = "Explaining missing answers to {SPJUA} queries",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "185--196",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Beskales:2010:SRF,
  author = "George Beskales and Ihab F. Ilyas and Lukasz Golab",
  title = "Sampling the repairs of functional dependency violations under hard constraints",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "197--207",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Menestrina:2010:EER,
  author = "David Menestrina and Steven Euijong Whang and Hector Garcia-Molina",
  title = "Evaluating entity resolution results",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "208--219",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chandramouli:2010:HPD,
  author = "Badrish Chandramouli and Jonathan Goldstein and David Maier",
  title = "High-performance dynamic pattern matching over disordered streams",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "220--231",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Botan:2010:SMA,
  author = "Irina Botan and Roozbeh Derakhshan and Nihal Dindar and Laura Haas and Ren{\'e}e J. Miller and Nesime Tatbul",
  title = "{SECRET}: a model for analysis of the execution semantics of stream processing systems",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "232--243",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2010:RPS,
  author = "Haopeng Zhang and Yanlei Diao and Neil Immerman",
  title = "Recognizing patterns in streams with imprecise timestamps",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "244--255",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Neumann:2010:XRF,
  author = "Thomas Neumann and Gerhard Weikum",
  title = "{x-RDF-3X}: fast querying, high update rates, and consistency for {RDF} databases",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "256--263",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2010:GPM,
  author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Nan Tang and Yinghui Wu and Yunpeng Wu",
  title = "Graph pattern matching: from intractable to polynomial time",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "264--275",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yildirim:2010:GSR,
  author = "Hilmi Yildirim and Vineet Chaoji and Mohammed J. Zaki",
  title = "{GRAIL}: scalable reachability index for large graphs",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "276--284",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bu:2010:HEI,
  author = "Yingyi Bu and Bill Howe and Magdalena Balazinska and Michael D. Ernst",
  title = "{HaLoop}: efficient iterative data processing on large clusters",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "285--296",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Benedikt:2010:IVV,
  author = "Michael Benedikt and Georg Gottlob",
  title = "The impact of virtual views on containment",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "297--308",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Terwilliger:2010:UET,
  author = "James F. Terwilliger and Lois M. L. Delcambre and David Maier and Jeremy Steinhauer and Scott Britell",
  title = "Updatable and evolvable transforms for virtual databases",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "309--319",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2010:NCM,
  author = "Daniel Deutch and Ohad Greenshpan and Tova Milo",
  title = "Navigating in complex mashed-up applications",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "320--329",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Melnik:2010:DIA,
  author = "Sergey Melnik and Andrey Gubarev and Jing Jing Long and Geoffrey Romer and Shiva Shivakumar and Matt Tolton and Theo Vassilakis",
  title = "{Dremel}: interactive analysis of {Web}-scale datasets",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "330--339",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhao:2010:GQO,
  author = "Peixiang Zhao and Jiawei Han",
  title = "On graph query optimization in large networks",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "340--351",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Martinenghi:2010:PRJ,
  author = "Davide Martinenghi and Marco Tagliasacchi",
  title = "Proximity rank join",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "352--363",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% In the next two entries the inline math in the title is wrapped in
%%% braces so that a bibliography style that changes title case cannot
%%% alter the math variable.

@Article{Vlachou:2010:IMI,
  author = "Akrivi Vlachou and Christos Doulkeridis and Kjetil N{\o}rv{\aa}g and Yannis Kotidis",
  title = "Identifying the most influential data objects with reverse top-{$k$} queries",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "364--372",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cao:2010:RTP,
  author = "Xin Cao and Gao Cong and Christian S. Jensen",
  title = "Retrieving top-{$k$} prestige-based relevant spatial {Web} objects",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "373--384",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2010:PLF,
  author = "Lei Li and B. Aditya Prakash and Christos Faloutsos",
  title = "Parsimonious linear fingerprinting for time series",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "385--396",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2010:HTM,
  author = "Rui Zhang and Martin Stradling",
  title = "The {HV-tree}: a memory hierarchy aware version index",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "397--408",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pramanik:2010:TRQ,
  author = "Sakti Pramanik and Alok Watve and Chad R. Meiners and Alex Liu",
  title = "Transforming range queries to equivalent box queries to optimize page access",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "409--416",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Guo:2010:RLU,
  author = "Songtao Guo and Xin Luna Dong and Divesh Srivastava and Remi Zajac",
  title = "Record linkage with uniqueness constraints and erroneous values",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "417--428",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ioannou:2010:FEA,
  author = "Ekaterini Ioannou and Wolfgang Nejdl and Claudia Nieder{\'e}e and Yannis Velegrakis",
  title = "On-the-fly entity-aware query processing in the presence of linkage",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "429--438",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yakout:2010:BBR,
  author = "Mohamed Yakout and Ahmed K. Elmagarmid and Hazem Elmeleegy and Mourad Ouzzani and Alan Qi",
  title = "Behavior based record linkage",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "439--448",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Han:2010:IFC,
  author = "Wook-Shin Han and Jinsoo Lee and Minh-Duc Pham and Jeffrey Xu Yu",
  title = "{iGraph}: a framework for comparisons of disk-based graph indexing techniques",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "449--459",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Schad:2010:RMC,
  author = "J{\"o}rg Schad and Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz",
  title = "Runtime measurements in the cloud: observing, analyzing, and reducing variance",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "460--471",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Jiang:2010:PMD,
  author = "Dawei Jiang and Beng Chin Ooi and Lei Shi and Sai Wu",
  title = "The performance of {MapReduce}: an in-depth study",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "472--483",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kopcke:2010:EER,
  author = "Hanna K{\"o}pcke and Andreas Thor and Erhard Rahm",
  title = "Evaluation of entity resolution approaches on real-world match problems",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "484--493",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Nykiel:2010:MSA,
  author = "Tomasz Nykiel and Michalis Potamias and Chaitanya Mishra and George Kollios and Nick Koudas",
  title = "{MRShare}: sharing across multiple queries in {MapReduce}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "494--505",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Vo:2010:TET,
  author = "Hoang Tam Vo and Chun Chen and Beng Chin Ooi",
  title = "Towards elastic transactional cloud storage with range query support",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "506--514",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dittrich:2010:HMY,
  author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz and Alekh Jindal and Yagiz Kargin and Vinay Setty and J{\"o}rg Schad",
  title = "{Hadoop++}: making a yellow elephant run like a cheetah (without it even noticing)",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "515--529",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bruno:2010:SLR,
  author = "Nicolas Bruno and Vivek Narasayya and Ravi Ramamurthy",
  title = "Slicing long-running queries",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "530--541",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tzoumas:2010:SAH,
  author = "Kostas Tzoumas and Amol Deshpande and Christian S. Jensen",
  title = "Sharing-aware horizontal partitioning for exploiting correlations during query processing",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "3",
  number = "1--2",
  pages = "542--553",
  month = sep,
  year = "2010",
  CODEN = "????",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri May 13 14:55:02 MDT 2011",
  bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cali:2010:APO,
  author =       "Andrea Cal{\`\i} and Georg Gottlob and Andreas Pieris",
  title =        "Advanced processing for ontological queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "554--565",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Parameswaran:2010:TWC,
  author =       "Aditya Parameswaran and Hector Garcia-Molina and Anand Rajaraman",
  title =        "Towards the {Web} of concepts: extracting concepts from large datasets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "566--577",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gulhane:2010:ECR,
  author =       "Pankaj Gulhane and Rajeev Rastogi and Srinivasan H. Sengamedu and Ashwin Tengli",
  title =        "Exploiting content redundancy for {Web} information extraction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "578--587",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2010:ARR,
  author =       "Bin Liu and Laura Chiticariu and Vivian Chu and H. V. Jagadish and Frederick R. Reiss",
  title =        "Automatic rule refinement for information extraction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "588--597",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pang:2010:ETS,
  author =       "HweeHwa Pang and Xuhua Ding and Xiaokui Xiao",
  title =        "Embellishing text search queries to protect user privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "598--607",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chaytor:2010:SDR,
  author =       "Rhonda Chaytor and Ke Wang",
  title =        "Small domain randomization: same privacy, more utility",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "608--618",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Papadopoulos:2010:NNS,
  author =       "Stavros Papadopoulos and Spiridon Bakiras and Dimitris Papadias",
  title =        "Nearest neighbor search with strong location privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "619--629",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kimura:2010:UPI,
  author =       "Hideaki Kimura and Samuel Madden and Stanley B. Zdonik",
  title =        "{UPI}: a primary index for uncertain databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "630--637",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2010:RCP,
  author =       "Jian Li and Amol Deshpande",
  title =        "Ranking continuous probabilistic datasets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "638--649",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lian:2010:SSJ,
  author =       "Xiang Lian and Lei Chen",
  title =        "Set similarity join on probabilistic data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "650--659",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Woods:2010:CED,
  author =       "Louis Woods and Jens Teubner and Gustavo Alonso",
  title =        "Complex event detection at wire speed with {FPGAs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "660--669",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fang:2010:DCG,
  author =       "Wenbin Fang and Bingsheng He and Qiong Luo",
  title =        "Database compression on graphics processors",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "670--680",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Johnson:2010:ASA,
  author =       "Ryan Johnson and Ippokratis Pandis and Radu Stoica and Manos Athanassoulis and Anastasia Ailamaki",
  title =        "{Aether}: a scalable approach to logging",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "681--692",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Macropol:2010:SDB,
  author =       "Kathy Macropol and Ambuj Singh",
  title =        "Scalable discovery of best clusters on large graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "693--702",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Smola:2010:APT,
  author =       "Alexander Smola and Shravan Narayanamurthy",
  title =        "An architecture for parallel topic models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "703--710",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ganti:2010:KFI,
  author =       "Venkatesh Ganti and Yeye He and Dong Xin",
  title =        "{Keyword++}: a framework to improve keyword search over entity databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "711--722",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2010:SMR,
  author =       "Zhenhui Li and Bolin Ding and Jiawei Han and Roland Kays",
  title =        "{Swarm}: mining relaxed temporal moving object clusters",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "723--734",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2010:AUP,
  author =       "Su Chen and Beng Chin Ooi and Zhenjie Zhang",
  title =        "An adaptive updating protocol for reducing moving object database workload",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "735--746",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kellaris:2010:SPC,
  author =       "Georgios Kellaris and Kyriakos Mouratidis",
  title =        "Shortest path computation on air indexes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "747--757",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Xu:2010:EES,
  author =       "Jia Xu and Zhenjie Zhang and Anthony K. H. Tung and Ge Yu",
  title =        "Efficient and effective similarity search over probabilistic data based on {Earth Mover's Distance}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "758--769",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Benedikt:2010:PXM,
  author =       "Michael Benedikt and Evgeny Kharlamov and Dan Olteanu and Pierre Senellart",
  title =        "Probabilistic {XML} via {Markov Chains}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "770--781",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Arumugam:2010:MRR,
  author =       "Subi Arumugam and Fei Xu and Ravi Jampani and Christopher Jermaine and Luis L. Perez and Peter J. Haas",
  title =        "{MCDB-R}: risk analysis in the database",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "782--793",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wick:2010:SPD,
  author =       "Michael Wick and Andrew McCallum and Gerome Miklau",
  title =        "Scalable probabilistic databases with factor graphs and {MCMC}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "794--804",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2010:MCF,
  author =       "Meihui Zhang and Marios Hadjieleftheriou and Beng Chin Ooi and Cecilia M. Procopiuc and Divesh Srivastava",
  title =        "On multi-column foreign key discovery",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "805--814",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cheng:2010:EEE,
  author =       "Reynold Cheng and Eric Lo and Xuan S. Yang and Ming-Hay Luk and Xiang Li and Xike Xie",
  title =        "Explore or exploit?: effective strategies for disambiguating large databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "815--825",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Soliman:2010:BRM,
  author =       "Mohamed A. Soliman and Ihab F. Ilyas and Mina Saleeb",
  title =        "Building ranked mashups of unstructured sources with uncertain information",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "826--837",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Raissi:2010:CCS,
  author =       "Chedy Ra{\"\i}ssi and Jian Pei and Thomas Kister",
  title =        "Computing closed skycubes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "838--847",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lo:2010:GDQ,
  author =       "Eric Lo and Nick Cheng and Wing-Kai Hon",
  title =        "Generating databases for query workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "848--859",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wu:2010:PTJ,
  author =       "Minji Wu and Laure Berti-{\'E}quille and Am{\'e}lie Marian and Cecilia M. Procopiuc and Divesh Srivastava",
  title =        "Processing top-$k$ join queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "860--870",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Martinez-Palau:2010:TWR,
  author =       "Xavier Martinez-Palau and David Dominguez-Sal and Josep Lluis Larriba-Pey",
  title =        "Two-way replacement selection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "871--881",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Maneth:2010:XWQ,
  author =       "Sebastian Maneth and Kim Nguyen",
  title =        "{XPath} whole query optimization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "882--893",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Grimsmo:2010:FOT,
  author =       "Nils Grimsmo and Truls A. Bj{\o}rklund and Magnus Lie Hetland",
  title =        "Fast optimal twig joins",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "894--905",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Benedikt:2010:DIX,
  author =       "Michael Benedikt and James Cheney",
  title =        "Destabilizers and independence of {XML} updates",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "906--917",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2010:SWH,
  author =       "Ziyang Liu and Qihong Shao and Yi Chen",
  title =        "Searching workflows with hierarchical views",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "918--927",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pandis:2010:DOT,
  author =       "Ippokratis Pandis and Ryan Johnson and Nikos Hardavellas and Anastasia Ailamaki",
  title =        "Data-oriented transaction execution",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "928--939",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2010:OTQ,
  author =       "Daniel Deutch and Tova Milo and Neoklis Polyzotis and Tom Yam",
  title =        "Optimal top-$k$ query evaluation for weighted business processes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "940--951",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2010:BSM,
  author =       "Guozhang Wang and Marcos Vaz Salles and Benjamin Sowell and Xun Wang and Tuan Cao and Alan Demers and Johannes Gehrke and Walker White",
  title =        "Behavioral simulations in {MapReduce}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "952--963",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ge:2010:TSS,
  author =       "Tingjian Ge and Stan Zdonik",
  title =        "{A*-tree}: a structure for storage and modeling of uncertain multidimensional arrays",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "964--974",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Aggarwal:2010:DPM,
  author =       "Charu C. Aggarwal and Yao Li and Philip S. Yu and Ruoming Jin",
  title =        "On dense pattern mining in graph streams",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "975--984",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's family name is the single letter U; it is not
%%% an abbreviated initial, so it carries no trailing period.
@Article{Yiu:2010:EPD,
  author =       "Man Lung Yiu and Leong Hou U and Simonas Saltenis and Kostas Tzoumas",
  title =        "Efficient proximity detection among mobile users via self-tuning policies",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "985--996",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The variable k is typeset in math mode, matching the top-$k$ titles
%%% used by other entries of this volume.
@Article{Potamias:2010:KNN,
  author =       "Michalis Potamias and Francesco Bonchi and Aristides Gionis and George Kollios",
  title =        "$k$-nearest neighbors in uncertain graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "997--1008",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cao:2010:MSS,
  author =       "Xin Cao and Gao Cong and Christian S. Jensen",
  title =        "Mining significant semantic locations from {GPS} data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1009--1020",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hay:2010:BAD,
  author =       "Michael Hay and Vibhor Rastogi and Gerome Miklau and Dan Suciu",
  title =        "Boosting the accuracy of differentially private histograms through consistency",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1021--1032",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cao:2010:UIP,
  author =       "Jianneng Cao and Panagiotis Karras and Chedy Ra{\"\i}ssi and Kian-Lee Tan",
  title =        "$ \rho $-uncertainty: inference-proof transaction anonymization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1033--1044",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cormode:2010:MMM,
  author =       "Graham Cormode and Divesh Srivastava and Ninghui Li and Tiancheng Li",
  title =        "Minimizing minimality and maximizing utility: analyzing method-based attacks on anonymized data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1045--1056",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2010:QPI,
  author =       "Daisy Zhe Wang and Michael J. Franklin and Minos Garofalakis and Joseph M. Hellerstein",
  title =        "Querying probabilistic information extraction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1057--1067",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sen:2010:ROF,
  author =       "Prithviraj Sen and Amol Deshpande and Lise Getoor",
  title =        "Read-once functions and query evaluation in probabilistic databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1068--1079",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's family name is the two-word surname Das Sarma;
%%% it is brace-grouped so that BibTeX treats both words as the last
%%% name instead of taking Das as a second given name.
@Article{Agrawal:2010:FUD,
  author =       "Parag Agrawal and Anish {Das Sarma} and Jeffrey Ullman and Jennifer Widom",
  title =        "Foundations of uncertain-data integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1080--1090",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mathioudakis:2010:IAD,
  author =       "Michael Mathioudakis and Nilesh Bansal and Nick Koudas",
  title =        "Identifying, attributing and describing spatial bursts",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1091--1102",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kimura:2010:CCA,
  author =       "Hideaki Kimura and George Huo and Alexander Rasin and Samuel Madden and Stanley B. Zdonik",
  title =        "{CORADD}: correlation aware database designer for materialized views and indexes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1103--1113",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's family name is the two-word surname Das Sarma;
%%% it is brace-grouped so that BibTeX treats both words as the last
%%% name instead of taking Das as a second given name.
@Article{Nanongkai:2010:RMR,
  author =       "Danupon Nanongkai and Atish {Das Sarma} and Ashwin Lall and Richard J. Lipton and Jun Xu",
  title =        "Regret-minimizing representative databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1114--1124",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Arai:2010:ACA,
  author =       "Benjamin Arai and Gautam Das and Dimitrios Gunopulos and Vagelis Hristidis and Nick Koudas",
  title =        "An access cost-aware approach for object retrieval over multiple sources",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1125--1136",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Abhirama:2010:SPC,
  author =       "M. Abhirama and Sourjya Bhaumik and Atreyee Dey and Harsh Shrimal and Jayant R.
Haritsa", title = "On the stability of plan costs and the costs of plan stability", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1137--1148", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Herodotou:2010:XST, author = "Herodotos Herodotou and Shivnath Babu", title = "{Xplus}: a {SQL}-tuning-aware query optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1149--1160", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2010:GHR, author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Hongzhi Wang and Yinghui Wu", title = "Graph homomorphism revisited for graph matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1161--1172", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kandhan:2010:SFS, author = "Ramakrishnan Kandhan and Nikhil Teletia and Jignesh M. 
Patel", title = "{SigMatch}: fast and scalable multi-pattern matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1173--1184", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2010:SSI, author = "Shijie Zhang and Jiong Yang and Wei Jin", title = "{SAPPER}: subgraph indexing and approximate matching in large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1185--1194", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2010:TIS, author = "Yinan Li and Bingsheng He and Robin Jun Yang and Qiong Luo and Ke Yi", title = "Tree indexing on solid state drives", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1195--1206", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2010:EBT, author = "Sai Wu and Dawei Jiang and Beng Chin Ooi and Kun-Lung Wu", title = "Efficient {B-tree} based indexing for cloud data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1207--1218", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2010:TJE, author = "Jiannan Wang and Jianhua Feng and Guoliang Li", title = "{Trie-join}: efficient trie-based string similarity joins with edit-distance constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1219--1230", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sharifzadeh:2010:VTR, author = "Mehdi Sharifzadeh and Cyrus Shahabi", title = "{VoR-tree}: {R-trees} with {Voronoi} diagrams for efficient processing of spatial nearest neighbor queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1231--1242", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deepak:2010:ERR, author = "P. Deepak and Prasad M. Deshpande", title = "Efficient {RkNN} retrieval with arbitrary non-metric similarity measures", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1243--1254", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2010:ESE, author = "Shiming Zhang and Nikos Mamoulis and David W. Cheung and Ben Kao", title = "Efficient skyline evaluation over partially ordered domains", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1255--1266", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wei:2010:AHO, author = "Mingzhu Wei and Elke A. Rundensteiner and Murali Mani", title = "Achieving high output quality under limited resources through structure-based spilling in {XML} streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1267--1278", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mihaylov:2010:DJO, author = "Svilen R. Mihaylov and Marie Jacob and Zachary G. Ives and Sudipto Guha", title = "Dynamic join optimization in multi-hop wireless sensor networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1279--1290", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Akdere:2010:DSC, author = "Mert Akdere and U{\u{g}}ur {\c{C}}etintemel and Eli Upfal", title = "Database-support for continuous prediction queries over streaming data", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1291--1301", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tran:2010:CAU, author = "Thanh T. L. Tran and Andrew McGregor and Yanlei Diao and Liping Peng and Anna Liu", title = "Conditioning and aggregating uncertain data streams: going beyond expectations", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1302--1313", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Glavic:2010:TUB, author = "Boris Glavic and Gustavo Alonso and Ren{\'e}e J. Miller and Laura M. Haas", title = "{TRAMP}: understanding the behavior of schema mappings through provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1314--1325", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Whang:2010:ERE, author = "Steven Euijong Whang and Hector Garcia-Molina", title = "Entity resolution with evolving rules", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1326--1337", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Limaye:2010:ASW, author = "Girija Limaye and Sunita Sarawagi and Soumen Chakrabarti", title = "Annotating and searching {Web} tables using entities, types and relationships", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1338--1347", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bedathur:2010:IPM, author = "Srikanta Bedathur and Klaus Berberich and Jens Dittrich and Nikos Mamoulis and Gerhard Weikum", title = "Interesting-phrase mining for ad-hoc text analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1348--1357", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2010:GDC, author = "Xin Luna Dong and Laure Berti-Equille and Yifan Hu and Divesh Srivastava", title = "Global detection of complex copying relationships between sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1358--1369", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{DeCapitanidiVimercati:2010:FLA, author = "Sabrina {De Capitani di Vimercati} and Sara Foresti and Sushil Jajodia and Stefano Paraboschi and Pierangela Samarati", title = "Fragments and loose associations: respecting privacy in data publishing", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1370--1381", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fusco:2010:NFF, author = "Francesco Fusco and Marc Ph. Stoecklin and Michail Vlachos", title = "{NET-FLi}: on-the-fly compression, archiving and indexing of streaming network traffic", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1382--1393", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zou:2010:SRQ, author = "Qiong Zou and Huayong Wang and Robert Soul{\'e} and Martin Hirzel and Henrique Andrade and Bu{\u{g}}ra Gedik and Kun-Lung Wu", title = "From a stream of relational queries to distributed stream processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1394--1405", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mah:2010:UUA, author = "James T. L. Mah and Danny C. C. Poo and Shaojiang Cai", title = "{UASMAs} (universal automated {SNP} mapping algorithms): a set of algorithms to instantaneously map {SNPs} in real time to aid functional {SNP} discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1406--1413", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Debnath:2010:FHT, author = "Biplob Debnath and Sudipta Sengupta and Jin Li", title = "{FlashStore}: high throughput persistent key--value store", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1414--1425", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xin:2010:MDA, author = "Reynold S. Xin and William McLaren and Patrick Dantressangle and Steve Schormann and Sam Lightstone and Maria Schwenger", title = "{MEET DB2}: automated database migration evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1426--1434", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Canim:2010:SBE, author = "Mustafa Canim and George A. Mihaila and Bishwaranjan Bhattacharjee and Kenneth A. Ross and Christian A. 
Lang", title = "{SSD} bufferpool extensions for database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1435--1446", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Loboz:2010:DWM, author = "Charles Loboz and Slawek Smyl and Suman Nath", title = "{DataGarage}: warehousing massive performance data on commodity servers", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1447--1458", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2010:CHP, author = "Songting Chen", title = "{Cheetah}: a high performance, custom data warehouse on top of {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1459--1468", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Orair:2010:DBO, author = "Gustavo H. Orair and Carlos H. C. 
Teixeira and Wagner {Meira, Jr.} and Ye Wang and Srinivasan Parthasarathy", title = "Distance-based outlier detection: consolidation and renewed bearing", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1469--1480", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kim:2010:ALM, author = "Young-Seok Kim and Heegyu Jin and Kyoung-Gu Woo", title = "Adaptive logging for mobile device", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1481--1492", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pesti:2010:RSL, author = "Peter Pesti and Ling Liu and Bhuvan Bamba and Arun Iyengar and Matt Weber", title = "{RoadTrack}: scaling location updates for mobile clients on road networks with query awareness", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1493--1504", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Si:2010:CID, author = "Xiance Si and Edward Y. 
Chang and Zolt{\'a}n Gy{\"o}ngyi and Maosong Sun", title = "{Confucius} and its intelligent disciples: integrating social with search", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1505--1516", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haritsa:2010:PDQ, author = "Jayant R. Haritsa", title = "The {Picasso} database query optimizer visualizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1517--1520", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2010:CED, author = "Ziyang Liu and Sivaramakrishnan Natarajan and Bin He and Hui-I Hsiao and Yi Chen", title = "{CODS}: evolving data efficiently and scalably in column oriented databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1521--1524", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sadoghi:2010:EEP, author = "Mohammad Sadoghi and Martin Labrecque and Harsh Singh and Warren Shum and Hans-Arno Jacobsen", title = "Efficient event processing through reconfigurable hardware for algorithmic trading", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1525--1528", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Levandoski:2010:CCP, author = "Justin J. Levandoski and Mohamed F. Mokbel and Mohamed E. Khalefa", title = "{CareDB}: a context and preference-aware location-based database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1529--1532", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kossmann:2010:CMC, author = "Donald Kossmann and Tim Kraska and Simon Loesing and Stephan Merkli and Raman Mittal and Flavio Pfaffhauser", title = "{Cloudy}: a modular cloud storage system", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1533--1536", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kazemitabar:2010:GSQ, author = "Seyed Jalal Kazemitabar and Ugur Demiryurek and Mohamed Ali and Afsin Akdogan and Cyrus Shahabi", title = "Geospatial stream query processing using {Microsoft SQL Server StreamInsight}", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1537--1540", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dyreson:2010:UXT, author = "Curtis E. Dyreson and Sourav S. 
Bhowmick and Kirankanth Mallampalli", title = "Using {XMorph} to transform {XML} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1541--1544", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2010:ACE, author = "Di Wang and Elke A. Rundensteiner and Han Wang and Richard T. {Ellison III}", title = "Active complex event processing: applications in real-time health care", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1545--1548", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schreiber:2010:TNP, author = "Tom Schreiber and Simone Bonetti and Torsten Grust and Manuel Mayr and Jan Rittinger", title = "Thirteen new players in the team: a {FERRY}-based {LINQ} to {SQL} provider", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1549--1552", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abiteboul:2010:AEC, author = "Serge Abiteboul and Pierre Bourhis and Bogdan Marinoiu and Alban Galland", title = "{AXART}: enabling collaborative work with {AXML} artifacts", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1553--1556", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{McConnell:2010:IAF, author = "Christopher McConnell and Fan Ping and Jeong-Hyon Hwang", title = "{iFlow}: an approach for fast and reliable {Internet-scale} stream processing utilizing detouring and replication", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1557--1560", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kantere:2010:PCT, author = "Verena Kantere and Maher Manoubi and Iluju Kiringa and Timos Sellis and John Mylopoulos", title = "Peer coordination through distributed triggers", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1561--1564", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2010:SSY, author = "Hao Wu and Guoliang Li and Chen Li and Lizhu Zhou", title = "{Seaform}: search-as-you-type in forms", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1565--1568", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Strotgen:2010:TSE, author = "Jannik Str{\"o}tgen and Michael Gertz", title = "{TimeTrails}: a system for exploring spatio-temporal information in documents", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1569--1572", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pound:2010:QEF, author = "Jeffrey Pound and Ihab F. Ilyas and Grant Weddell", title = "{QUICK}: expressive and flexible search over knowledge bases and text collections", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1573--1576", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kwietniewski:2010:TXD, author = "Marcin Kwietniewski and Jarek Gryz and Stephanie Hazlewood and Paul {Van Run}", title = "Transforming {XML} documents as schemas evolve", journal = j-PROC-VLDB-ENDOWMENT, volume = "3", number = "1--2", pages = "1577--1580", month = sep, year = "2010", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:02 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2010:XCT,
  author =       "Ziyang Liu and Sivaramakrishnan Natarajan and Peng Sun and Stephen Booher and Tim Meehan and Robert Winkler and Yi Chen",
  title =        "{XSACT}: a comparison tool for structured search results",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1581--1584",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Abdessalem:2010:OLT,
  author =       "Talel Abdessalem and Bogdan Cautis and Nora Derouiche",
  title =        "{ObjectRunner}: lightweight, targeted extraction and querying of structured {Web} data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1585--1588",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Elbassuoni:2010:RRW,
  author =       "Shady Elbassuoni and Katja Hose and Steffen Metzger and Ralf Schenkel",
  title =        "{ROXXI}: {Reviving} witness {dOcuments} to {eXplore eXtracted Information}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1589--1592",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Termehchy:2010:EUD,
  author =       "Arash Termehchy and Marianne Winslett",
  title =        "{EXTRUCT}: using deep structural information in {XML} keyword search",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1593--1596",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Akbarnejad:2010:SQR,
  author =       "Javad Akbarnejad and Gloria Chatzopoulou and Magdalini Eirinaki and Suju Koshy and Sarika Mittal and Duc On and Neoklis Polyzotis and Jothi S.
Vindhiya Varman",
  title =        "{SQL QueRIE} recommendations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1597--1600",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ang:2010:PCM,
  author =       "Hock Hee Ang and Vivekanand Gopalkrishnan and Wee Keong Ng and Steven C. H. Hoi",
  title =        "{P2PDocTagger}: content management through automated {P2P} collaborative tagging",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1601--1604",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Setty:2010:IEI,
  author =       "Vinay Setty and Srikanta Bedathur and Klaus Berberich and Gerhard Weikum",
  title =        "{InZeit}: efficiently identifying insightful time points",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1605--1608",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sun:2010:IIT,
  author =       "Aixin Sun and Sourav S.
Bhowmick and Yao Liu",
  title =        "{iAVATAR}: an interactive tool for finding and visualizing visual-representative tags in image search",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1609--1612",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kabisch:2010:DWI,
  author =       "Thomas Kabisch and Eduard C. Dragut and Clement Yu and Ulf Leser",
  title =        "Deep {Web} integration with {VisQI}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1613--1616",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dong:2010:SST,
  author =       "Xin Luna Dong and Laure Berti-Equille and Yifan Hu and Divesh Srivastava",
  title =        "{SOLOMON}: seeking the truth via copying detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1617--1620",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hentschel:2010:JTD,
  author =       "Martin Hentschel and Laura Haas and Ren{\'e}e J.
Miller",
  title =        "Just-in-time data integration in action",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1621--1624",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Alexandrov:2010:MPD,
  author =       "Alexander Alexandrov and Max Heimel and Volker Markl and Dominic Battr{\'e} and Fabian Hueske and Erik Nijkamp and Stephan Ewen and Odej Kao and Daniel Warneke",
  title =        "Massively parallel data analysis with {PACTs} on {Nephele}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1625--1628",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Middelfart:2010:UST,
  author =       "Morten Middelfart and Torben Bach Pedersen",
  title =        "Using sentinel technology in the {TARGIT BI} suite",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1629--1632",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gunnemann:2010:CIC,
  author =       "Stephan G{\"u}nnemann and Ines F{\"a}rber and Hardy Kremer and Thomas Seidl",
  title =        "{CoDA}: interactive cluster based concept discovery",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1633--1636",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bergamaschi:2010:KSK,
  author =       "Sonia Bergamaschi and Elton Domnori and Francesco Guerra and Mirko Orsini and Raquel {Trillo Lado} and Yannis Velegrakis",
  title =        "{Keymantic}: semantic keyword-based searching in data integration systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1637--1640",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Golab:2010:DAE,
  author =       "Lukasz Golab and Howard Karloff and Flip Korn and Divesh Srivastava",
  title =        "Data {Auditor}: exploring data quality and semantics using pattern tableaux",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1641--1644",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Nori:2010:DCP,
  author =       "Anil K. Nori",
  title =        "Distributed caching platforms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1645--1646",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Agrawal:2010:BDC,
  author =       "Divyakant Agrawal and Sudipto Das and Amr {El Abbadi}",
  title =        "Big data and cloud computing: new wine or just new bottles?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1647--1648",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Samet:2010:TSS,
  author =       "Hanan Samet",
  title =        "Techniques for similarity searching in multimedia databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1649--1650",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Etzion:2010:EPP,
  author =       "Opher Etzion",
  title =        "Event processing: past, present and future",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1651--1652",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Renz:2010:SSM,
  author =       "Matthias Renz and Reynold Cheng and Hans-Peter Kriegel",
  title =        "Similarity search and mining in uncertain databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1653--1654",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Muthukrishnan:2010:DMM,
  author =       "S.
Muthukrishnan",
  title =        "Data management and mining in {Internet} ad systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "3",
  number =       "1--2",
  pages =        "1655--1656",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:02 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kling:2010:GEE,
  author =       "Patrick Kling and M. Tamer {\"O}zsu and Khuzaima Daudjee",
  title =        "Generating efficient execution plans for vertically partitioned {XML} databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "1",
  pages =        "1--11",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lian:2010:GFH,
  author =       "Xiang Lian and Lei Chen",
  title =        "A generic framework for handling uncertain data with local correlations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "1",
  pages =        "12--21",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Khoussainova:2010:SCA,
  author =       "Nodira Khoussainova and YongChul Kwon and Magdalena Balazinska and Dan Suciu",
  title =        "{SnipSuggest}: context-aware autocompletion for {SQL}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "1",
  pages =        "22--33",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Meliou:2010:CCR,
  author =       "Alexandra Meliou and Wolfgang Gatterbauer and Katherine F. Moore and Dan Suciu",
  title =        "The complexity of causality and responsibility for query answers and non-answers",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "1",
  pages =        "34--45",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sagy:2010:DTQ,
  author =       "Guy Sagy and Daniel Keren and Izchak Sharfman and Assaf Schuster",
  title =        "Distributed threshold querying of general functions by a difference of monotonic representation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "46--57",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2010:TBD,
  author =       "Nan Wang and Jingbo Zhang and Kian-Lee Tan and Anthony K. H. Tung",
  title =        "On triangulation-based dense neighborhood graph discovery",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "58--68",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Rice:2010:GIR,
  author =       "Michael Rice and Vassilis J. Tsotras",
  title =        "Graph indexing of road networks for shortest path queries with label restrictions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "69--80",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Qian:2010:CUF,
  author =       "Li Qian and Kristen LeFevre and H. V. Jagadish",
  title =        "{CRIUS}: user-friendly database design",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "81--92",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Rocha-Junior:2010:EPT,
  author =       "Jo{\~a}o B. Rocha-Junior and Akrivi Vlachou and Christos Doulkeridis and Kjetil N{\o}rv{\aa}g",
  title =        "Efficient processing of top-$k$ spatial preference queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "93--104",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Grund:2010:HMM,
  author =       "Martin Grund and Jens Kr{\"u}ger and Hasso Plattner and Alexander Zeier and Philippe Cudr{\'e}-Mauroux and Samuel Madden",
  title =        "{HYRISE}: a main memory hybrid storage engine",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "105--116",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Curino:2010:URI,
  author =       "Carlo A. Curino and Hyun Jin Moon and Alin Deutsch and Carlo Zaniolo",
  title =        "Update rewriting and integrity constraint maintenance in a schema evolution support system: {PRISM++}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "117--128",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Oro:2010:SEX,
  author =       "Ermelinda Oro and Massimo Ruffolo and Steffen Staab",
  title =        "{SXPath}: extending {XPath} towards spatial querying on {Web} documents",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "129--140",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yuan:2010:PPP,
  author =       "Mingxuan Yuan and Lei Chen and Philip S. Yu",
  title =        "Personalized privacy protection in social networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "2",
  pages =        "141--150",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:15 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Toda:2010:PAA,
  author =       "Guilherme A. Toda and Eli Cortez and Altigran S. da Silva and Edleno de Moura",
  title =        "A probabilistic approach for automatically filling form-based {Web} interfaces",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "3",
  pages =        "151--160",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:16 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Papadimitriou:2010:OUB,
  author =       "Panagiotis Papadimitriou and Hector Garcia-Molina and Ali Dasdan and Santanu Kolay",
  title =        "Output {URL} bidding",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "3",
  pages =        "161--172",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:16 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bahmani:2010:FIP,
  author =       "Bahman Bahmani and Abdur Chowdhury and Ashish Goel",
  title =        "Fast incremental and personalized {PageRank}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "3",
  pages =        "173--184",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:16 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In this paper, we analyze the efficiency of Monte Carlo methods for incremental computation of PageRank, personalized PageRank, and similar random walk based methods (with focus on SALSA), on large-scale dynamically evolving social networks. We assume that the graph of friendships is stored in distributed shared memory, as is the case for large social networks such as Twitter.\par For global PageRank, we assume that the social network has $n$ nodes, and $m$ adversarially chosen edges arrive in a random order. We show that with a reset probability of $ \epsilon $, the expected total work needed to maintain an accurate estimate (using the Monte Carlo method) of the PageRank of every node at all times is $ O(n \ln m / \epsilon^2)$. This is significantly better than all known bounds for incremental PageRank. For instance, if we naively recompute the PageRanks as each edge arrives, the simple power iteration method needs $ \Omega (m^2 / \ln (1 / (1 - \epsilon)))$ total time and the Monte Carlo method needs $ O(m n / \epsilon)$ total time; both are prohibitively expensive. We also show that we can handle deletions equally efficiently.\par We then study the computation of the top $k$ personalized PageRanks starting from a seed node, assuming that personalized PageRanks follow a power-law with exponent $ \alpha < 1$.
We show that if we store $ R > q \ln n$ random walks starting from every node for large enough constant $q$ (using the approach outlined for global PageRank), then the expected number of calls made to the distributed social network database is $ O(k / (R^{(1 - \alpha) / \alpha }))$. We also present experimental results from the social networking site, Twitter, verifying our assumptions and analyses. The overall result is that this algorithm is fast enough for real-time queries over a dynamic social network.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lee:2010:QES,
  author =       "Jongwuk Lee and Seung-won Hwang",
  title =        "{QSkycube}: efficient skycube computation using point-based space partitioning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "3",
  pages =        "185--196",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:16 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2010:ZEI,
  author =       "Bin Liu and Chee-Yong Chan",
  title =        "{ZINC}: efficient indexing for skyline computation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "3",
  pages =        "197--207",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:16 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Rastogi:2011:LSC,
  author =       "Vibhor Rastogi and Nilesh Dalvi and Minos Garofalakis",
  title =        "Large-scale collective entity matching",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "4",
  pages =        "208--218",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:17 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dalvi:2011:AWL,
  author =       "Nilesh Dalvi and Ravi Kumar and Mohamed Soliman",
  title =        "Automatic wrappers for large scale {Web} extraction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "4",
  pages =        "219--230",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:17 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yang:2011:FSM,
  author =       "Xintian Yang and Srinivasan Parthasarathy and P. Sadayappan",
  title =        "Fast sparse matrix-vector multiplication on {GPUs}: implications for graph mining",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "4",
  pages =        "231--242",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:17 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Rao:2011:UPB,
  author =       "Jun Rao and Eugene J. Shekita and Sandeep Tata",
  title =        "Using {Paxos} to build a scalable, consistent, and highly available datastore",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "4",
  pages =        "243--254",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:17 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ding:2011:FSI,
  author =       "Bolin Ding and Arnd Christian K{\"o}nig",
  title =        "Fast set intersection in memory",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "4",
  pages =        "255--266",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:17 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Parameswaran:2011:HAG,
  author =       "Aditya Parameswaran and Anish {Das Sarma} and Hector Garcia-Molina and Neoklis Polyzotis and Jennifer Widom",
  title =        "Human-assisted graph search: it's okay to ask questions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "5",
  pages =        "267--278",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:18 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yakout:2011:GDR,
  author =       "Mohamed Yakout and Ahmed K. Elmagarmid and Jennifer Neville and Mourad Ouzzani and Ihab F. Ilyas",
  title =        "Guided data repair",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "5",
  pages =        "279--289",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:18 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Venetis:2011:HLD,
  author =       "Petros Venetis and Hector Gonzalez and Christian S. Jensen and Alon Halevy",
  title =        "Hyper-local, directions-based ranking of places",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "5",
  pages =        "290--301",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:18 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Koc:2011:IMC,
  author =       "M. Levent Koc and Christopher R{\'e}",
  title =        "Incrementally maintaining classification using an {RDBMS}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "4",
  number =       "5",
  pages =        "302--313",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 13 14:55:18 MDT 2011",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2011:HTT, author = "Bingsheng He and Jeffrey Xu Yu", title = "High-throughput transaction executions on graphics processors", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "5", pages = "314--325", month = feb, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:18 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2011:DIQ, author = "Zhao Cao and Charles Sutton and Yanlei Diao and Prashant Shenoy", title = "Distributed inference and query processing for {RFID} tracking and monitoring", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "5", pages = "326--337", month = feb, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:55:18 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2011:SJS, author = "Hongrae Lee and Raymond T. Ng and Kyuseok Shim", title = "Similarity join size estimation using locality sensitive hashing", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "338--349", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2011:QEB, author = "Ziyang Liu and Sivaramakrishnan Natarajan and Yi Chen", title = "Query expansion based on clustered results", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "350--361", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dash:2011:CSP, author = "Debabrata Dash and Neoklis Polyzotis and Anastasia Ailamaki", title = "{CoPhy}: a scalable, portable, and interactive index advisor for large workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "362--372", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Niu:2011:TSS, author = "Feng Niu and Christopher R{\'e} and AnHai Doan and Jude Shavlik", title = "{Tuffy}: scaling up statistical inference in {Markov} logic networks using an {RDBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "373--384", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jahani:2011:AOM, author = "Eaman Jahani and Michael J. Cafarella and Christopher R{\'e}", title = "Automatic optimization for {MapReduce} programs", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "385--396", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2011:STG, author = "De-Nian Yang and Yi-Ling Chen and Wang-Chien Lee and Ming-Syan Chen", title = "On social-temporal group query with acquaintance constraint", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "6", pages = "397--408", month = mar, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 13 14:45:07 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2011:SPO, author = "Hoa Nguyen and Ariel Fuxman and Stelios Paparizos and Juliana Freire and Rakesh Agrawal", title = "Synthesizing products for online catalogs", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "409--418", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Floratou:2011:COS, author = "Avrilia Floratou and Jignesh M. Patel and Eugene J. Shekita and Sandeep Tata", title = "Column-oriented storage techniques for {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "419--429", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lomet:2011:IPC, author = "David Lomet and Kostas Tzoumas and Michael Zwilling", title = "Implementing performance competitive logical recovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "430--439", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Machanavajjhala:2011:PSR, author = "Ashwin Machanavajjhala and Aleksandra Korolova and Atish Das Sarma", title = "Personalized social recommendations: accurate or private?", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "440--450", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Capannini:2011:EDW, author = "Gabriele Capannini and Franco Maria Nardini and Raffaele Perego and Fabrizio Silvestri", title = "Efficient diversification of {Web} search results", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "451--459", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{DeFrancisciMorales:2011:SCM, author = "Gianmarco {De Francisci Morales} and Aristides Gionis and Mauro Sozio", title = "Social content matching in {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "7", pages = "460--469", month = apr, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Jun 7 19:31:12 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ao:2011:EPL, author = "Naiyong Ao and Fan Zhang and Di Wu and Douglas S. 
Stones and Gang Wang and Xiaoguang Liu and Jing Liu and Sheng Lin", title = "Efficient parallel lists intersection and index compression algorithms using graphics processing units", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "8", pages = "470--481", month = may, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zou:2011:GAS, author = "Lei Zou and Jinghui Mo and Lei Chen and M. Tamer {\"O}zsu and Dongyan Zhao", title = "{gStore}: answering {SPARQL} queries via subgraph matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "8", pages = "482--493", month = may, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Das:2011:ALE, author = "Sudipto Das and Shoji Nishimura and Divyakant Agrawal and Amr {El Abbadi}", title = "{Albatross}: lightweight elasticity in shared storage databases for the cloud using live data migration", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "8", pages = "494--505", month = may, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nutanong:2011:IHD, author = "Sarana Nutanong and Edwin H. Jacox and Hanan Samet", title = "An incremental {Hausdorff} distance calculation algorithm", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "8", pages = "506--517", month = may, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Blaustein:2011:SPP, author = "Barbara Blaustein and Adriane Chapman and Len Seligman and M. David Allen and Arnon Rosenthal", title = "Surrogate parenthood: protected and informative graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "8", pages = "518--525", month = may, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:33 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Venetis:2011:RST, author = "Petros Venetis and Alon Halevy and Jayant Madhavan and Marius Pasca and Warren Shen and Fei Wu and Gengxin Miao and Chung Wu", title = "Recovering semantics of tables on the {Web}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "528--538", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Neumann:2011:ECE, author = "Thomas Neumann", title = "Efficiently compiling efficient query plans for modern hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "539--550", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jin:2011:DCR, author = "Ruoming Jin and Lin Liu and Bolin Ding and Haixun Wang", title = "Distance-constraint reachability computation in uncertain graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "551--562", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chi:2011:IIC, author = "Yun Chi and Hyun Jin Moon and Hakan Hacig{\"u}m{\"u}s", title = "{iCBS}: incremental cost-based scheduling under piecewise linear {SLAs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "563--574", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eltabakh:2011:CFD, author = "Mohamed Y. Eltabakh and Yuanyuan Tian and Fatma {\"O}zcan and Rainer Gemulla and Aljoscha Krettek and John McPherson", title = "{CoHadoop}: flexible data placement and its exploitation in {Hadoop}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "575--585", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Idreos:2011:MWC, author = "Stratos Idreos and Stefan Manegold and Harumi Kuno and Goetz Graefe", title = "Merging what's cracked, cracking what's merged: adaptive indexing in main-memory column-stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "9", pages = "586--597", month = jun, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2011:PTR, author = "Chonghai Wang and Li Yan Yuan and Jia-Huai You and Osmar R. 
Zaiane and Jian Pei", title = "On pruning for top-$k$ ranking in uncertain databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "598--609", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pandis:2011:PPL, author = "Ippokratis Pandis and Pinar T{\"o}z{\"u}n and Ryan Johnson and Anastasia Ailamaki", title = "{PLP}: page latch-free shared-everything {OLTP}", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "610--621", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2011:EMH, author = "Jiannan Wang and Guoliang Li and Jeffrey Xu Yu and Jianhua Feng", title = "Entity matching: how similar is similar", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "622--633", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2011:ACE, author = "Di Wang and Elke A. Rundensteiner and Richard T. 
{Ellison III}", title = "Active complex event processing over event streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "634--645", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Budak:2011:STA, author = "Ceren Budak and Divyakant Agrawal and Amr {El Abbadi}", title = "Structural trend analysis for online social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "646--656", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kimura:2011:CAP, author = "Hideaki Kimura and Vivek Narasayya and Manoj Syamala", title = "Compression aware physical database design", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "657--668", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bernecker:2011:EPR, author = "Thomas Bernecker and Tobias Emrich and Hans-Peter Kriegel and Matthias Renz and Stefan Zankl and Andreas Z{\"u}fle", title = "Efficient probabilistic reverse nearest neighbor query processing on uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "669--680", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kargar:2011:KSG, author = "Mehdi Kargar and Aijun An", title = "Keyword search in graphs: finding $r$-cliques", journal = j-PROC-VLDB-ENDOWMENT, volume = "4", number = "10", pages = "681--692", month = jul, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 5 17:23:34 MDT 2011", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fabbri:2011:EBA, author = "Daniel Fabbri and Kristen LeFevre", title = "Explanation-based auditing", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "1--12", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To comply with emerging privacy laws and regulations, it has become common for applications like electronic health records systems (EHRs) to collect access logs, which record each time a user (e.g., a hospital employee) accesses a piece of sensitive data (e.g., a patient record). Using the access log, it is easy to answer simple queries (e.g., Who accessed Alice's medical record?), but this often does not provide enough information. In addition to learning who accessed their medical records, patients will likely want to understand why each access occurred. In this paper, we introduce the problem of generating explanations for individual records in an access log. The problem is motivated by user-centric auditing applications, and it also provides a novel approach to misuse detection. We develop a framework for modeling explanations which is based on a fundamental observation: For certain classes of databases, including EHRs, the reason for most data accesses can be inferred from data stored elsewhere in the database. For example, if Alice has an appointment with Dr. Dave, this information is stored in the database, and it explains why Dr. Dave looked at Alice's record. Large numbers of data accesses can be explained using general forms called explanation templates. 
Rather than requiring an administrator to manually specify explanation templates, we propose a set of algorithms for automatically discovering frequent templates from the database (i.e., those that explain a large number of accesses). We also propose techniques for inferring collaborative user groups, which can be used to enhance the quality of the discovered explanations. Finally, we have evaluated our proposed techniques using an access log and data from the University of Michigan Health System. Our results demonstrate that in practice we can provide explanations for over 94\% of data accesses in the log.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marcus:2011:HPS, author = "Adam Marcus and Eugene Wu and David Karger and Samuel Madden and Robert Miller", title = "Human-powered sorts and joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "13--24", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourcing markets like Amazon's Mechanical Turk (MTurk) make it possible to task people with small jobs, such as labeling images or looking up phone numbers, via a programmatic interface. MTurk tasks for processing datasets with humans are currently designed with significant reimplementation of common workflows and ad-hoc selection of parameters such as price to pay per task. We describe how we have integrated crowds into a declarative workflow engine called Qurk to reduce the burden on workflow designers. In this paper, we focus on how to use humans to compare items for sorting and joining data, two of the most common operations in DBMSs. We describe our basic query interface and the user interface of the tasks we post to MTurk. 
We also propose a number of optimizations, including task batching, replacing pairwise comparisons with numerical ratings, and pre-filtering tables before joining them, which dramatically reduce the overall cost of running sorts and joins on the crowd. In an experiment joining two sets of images, we reduce the overall cost from \$67 in a naive implementation to about \$3, without substantially affecting accuracy or latency. In an end-to-end experiment, we reduced cost by a factor of 14.5.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cormode:2011:VCS, author = "Graham Cormode and Justin Thaler and Ke Yi", title = "Verifying computations with streaming interactive proofs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "25--36", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "When computation is outsourced, the data owner would like to be assured that the desired computation has been performed correctly by the service provider. In theory, proof systems can give the necessary assurance, but prior work is not sufficiently scalable or practical. In this paper, we develop new proof protocols for verifying computations which are streaming in nature: the verifier (data owner) needs only logarithmic space and a single pass over the input, and after observing the input follows a simple protocol with a prover (service provider) that takes logarithmic communication spread over a logarithmic number of rounds. These ensure that the computation is performed correctly: that the service provider has not made any errors or missed out some data. 
The guarantee is very strong: even if the service provider deliberately tries to cheat, there is only vanishingly small probability of doing so undetected, while a correct computation is always accepted. We first observe that some theoretical results can be modified to work with streaming verifiers, showing that there are efficient protocols for problems in the complexity classes NP and NC. Our main results then seek to bridge the gap between theory and practice by developing usable protocols for a variety of problems of central importance in streaming and database processing. All these problems require linear space in the traditional streaming model, and therefore our protocols demonstrate that adding a prover can exponentially reduce the effort needed by the verifier. Our experimental results show that our protocols are practical and scalable.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2011:MOI, author = "Dan Lin and Christian S. Jensen and Rui Zhang and Lu Xiao and Jiaheng Lu", title = "A moving-object index for efficient query processing with peer-wise location privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "37--48", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the growing use of location-based services, location privacy attracts increasing attention from users, industry, and the research community. While considerable effort has been devoted to inventing techniques that prevent service providers from knowing a user's exact location, relatively little attention has been paid to enabling so-called peer-wise privacy---the protection of a user's location from unauthorized peer users. 
This paper identifies an important efficiency problem in existing peer-privacy approaches that simply apply a filtering step to identify users that are located in a query range, but that do not want to disclose their location to the querying peer. To solve this problem, we propose a novel, privacy-policy enabled index called the PEB-tree that seamlessly integrates location proximity and policy compatibility. We propose efficient algorithms that use the PEB-tree for processing privacy-aware range and $k$NN queries. Extensive experiments suggest that the PEB-tree enables efficient query processing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mansour:2011:EES, author = "Essam Mansour and Amin Allam and Spiros Skiadopoulos and Panos Kalnis", title = "{ERA}: efficient serial and parallel suffix tree construction for very long strings", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "49--60", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The suffix tree is a data structure for indexing strings. It is used in a variety of applications such as bioinformatics, time series analysis, clustering, text editing and data compression. However, when the string and the resulting suffix tree are too large to fit into the main memory, most existing construction algorithms become very inefficient. This paper presents a disk-based suffix tree construction method, called Elastic Range (ERa), which works efficiently with very long strings that are much larger than the available memory. 
ERa partitions the tree construction process horizontally and vertically and minimizes I/Os by dynamically adjusting the horizontal partitions independently for each vertical partition, based on the evolving shape of the tree and the available memory. Where appropriate, ERa also groups vertical partitions together to amortize the I/O cost. We developed a serial version; a parallel version for shared-memory and shared-disk multi-core systems; and a parallel version for shared-nothing architectures. ERa indexes the entire human genome in 19 minutes on an ordinary desktop computer. For comparison, the fastest existing method needs 15 minutes using 1024 CPUs on an IBM BlueGene supercomputer.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Krueger:2011:FUR, author = "Jens Krueger and Changkyu Kim and Martin Grund and Nadathur Satish and David Schwalb and Jatin Chhugani and Hasso Plattner and Pradeep Dubey and Alexander Zeier", title = "Fast updates on read-optimized databases using multi-core {CPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "61--72", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Read-optimized columnar databases use differential updates to handle writes by maintaining a separate write-optimized delta partition which is periodically merged with the read-optimized and compressed main partition. This merge process introduces significant overheads and unacceptable downtimes in update intensive systems, aspiring to combine transactional and analytical workloads into one system. In the first part of the paper, we report data analyses of 12 SAP Business Suite customer systems. 
In the second half, we present an optimized merge process reducing the merge overhead of current systems by a factor of 30. Our linear-time merge algorithm exploits the underlying high compute and bandwidth resources of modern multi-core CPUs with architecture-aware optimizations and efficient parallelization. This enables compressed in-memory column stores to handle the transactional update rate required by enterprise applications, while keeping properties of read-optimized databases for analytic-style queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goyal:2011:DBA, author = "Amit Goyal and Francesco Bonchi and Laks V. S. Lakshmanan", title = "A data-based approach to social influence maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "1", pages = "73--84", month = sep, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:06 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Influence maximization is the problem of finding a set of users in a social network, such that by targeting this set, one maximizes the expected spread of influence in the network. Most of the literature on this topic has focused exclusively on the social graph, overlooking historical data, i.e., traces of past action propagations. In this paper, we study influence maximization from a novel data-based perspective. In particular, we introduce a new model, which we call credit distribution, that directly leverages available propagation traces to learn how influence flows in the network and uses this to estimate expected influence spread. Our approach also learns the different levels of influence-ability of users, and it is time-aware in the sense that it takes the temporal nature of influence into account. 
We show that influence maximization under the credit distribution model is NP-hard and that the function that defines expected spread under our model is submodular. Based on these, we develop an approximation algorithm for solving the influence maximization problem that at once enjoys high accuracy compared to the standard approach, while being several orders of magnitude faster and more scalable.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pavlo:2011:PMO, author = "Andrew Pavlo and Evan P. C. Jones and Stanley Zdonik", title = "On predictive modeling for optimizing transaction execution in parallel {OLTP} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "85--96", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A new emerging class of parallel database management systems (DBMS) is designed to take advantage of the partitionable workloads of on-line transaction processing (OLTP) applications [23, 20]. Transactions in these systems are optimized to execute to completion on a single node in a shared-nothing cluster without needing to coordinate with other nodes or use expensive concurrency control measures [18]. But some OLTP applications cannot be partitioned such that all of their transactions execute within a single-partition in this manner. These distributed transactions access data not stored within their local partitions and subsequently require more heavy-weight concurrency control protocols. Further difficulties arise when the transaction's execution properties, such as the number of partitions it may need to access or whether it will abort, are not known beforehand.
The DBMS could mitigate these performance issues if it is provided with additional information about transactions. Thus, in this paper we present a Markov model-based approach for automatically selecting which optimizations a DBMS could use, namely (1) more efficient concurrency control schemes, (2) intelligent scheduling, (3) reduced undo logging, and (4) speculative execution. To evaluate our techniques, we implemented our models and integrated them into a parallel, main-memory OLTP DBMS to show that we can improve the performance of applications with diverse workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goasdoue:2011:VSS, author = "Fran{\c{c}}ois Goasdou{\'e} and Konstantinos Karanasos and Julien Leblay and Ioana Manolescu", title = "View selection in {Semantic Web} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "97--108", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the setting of a Semantic Web database, containing both explicit data encoded in RDF triples, and implicit data, implied by the RDF semantics. Based on a query workload, we address the problem of selecting a set of views to be materialized in the database, minimizing a combination of query processing, view storage, and view maintenance costs. Starting from an existing relational view selection method, we devise new algorithms for recommending view sets, and show that they scale significantly beyond the existing relational ones when adapted to the RDF context. 
To account for implicit triples in query answers, we propose a novel RDF query reformulation algorithm and an innovative way of incorporating it into view selection in order to avoid a combinatorial explosion in the complexity of the selection process. The interest of our techniques is demonstrated through a set of experiments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jestes:2011:BWH, author = "Jeffrey Jestes and Ke Yi and Feifei Li", title = "Building wavelet histograms on large data in {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "109--120", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce is becoming the de facto framework for storing and processing massive data, due to its excellent scalability, reliability, and elasticity. In many MapReduce applications, obtaining a compact accurate summary of data is essential. Among various data summarization tools, histograms have proven to be particularly important and useful for summarizing data, and the wavelet histogram is one of the most widely used histograms. In this paper, we investigate the problem of building wavelet histograms efficiently on large datasets in MapReduce. We measure the efficiency of the algorithms by both end-to-end running time and communication cost. We demonstrate straightforward adaptations of existing exact and approximate methods for building wavelet histograms to MapReduce clusters are highly inefficient. To that end, we design new algorithms for computing exact and approximate wavelet histograms and discuss their implementation in MapReduce. 
We illustrate our techniques in Hadoop, and compare to baseline solutions with extensive experiments performed in a heterogeneous Hadoop cluster of 16 nodes, using large real and synthetic datasets, up to hundreds of gigabytes. The results suggest significant (often orders of magnitude) performance improvement achieved by our new algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2011:SMD, author = "Di Yang and Elke A. Rundensteiner and Matthew O. Ward", title = "Summarization and matching of density-based clusters in streaming environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "121--132", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Density-based cluster mining is known to serve a broad range of applications ranging from stock trade analysis to moving object monitoring. Although methods for efficient extraction of density-based clusters have been studied in the literature, the problem of summarizing and matching of such clusters with arbitrary shapes and complex cluster structures remains unsolved. Therefore, the goal of our work is to extend the state-of-art of density-based cluster mining in streams from cluster extraction only to now also support analysis and management of the extracted clusters. Our work solves three major technical challenges. First, we propose a novel multi-resolution cluster summarization method, called Skeletal Grid Summarization (SGS), which captures the key features of density-based clusters, covering both their external shape and internal cluster structures. 
Second, in order to summarize the extracted clusters in real-time, we present an integrated computation strategy C-SGS, which piggybacks the generation of cluster summarizations within the online clustering process. Lastly, we design a mechanism to efficiently execute cluster matching queries, which identify similar clusters for given cluster of analyst's interest from clusters extracted earlier in the stream history. Our experimental study using real streaming data shows the clear superiority of our proposed methods in both efficiency and effectiveness for cluster summarization and cluster matching queries to other potential alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2011:MSM, author = "Thanh Nguyen and Viviane Moreira and Huong Nguyen and Hoa Nguyen and Juliana Freire", title = "Multilingual schema matching for {Wikipedia} infoboxes", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "133--144", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent research has taken advantage of Wikipedia's multi-lingualism as a resource for cross-language information retrieval and machine translation, as well as proposed techniques for enriching its cross-language structure. The availability of documents in multiple languages also opens up new opportunities for querying structured Wikipedia content, and in particular, to enable answers that straddle different languages. As a step towards supporting such queries, in this paper, we propose a method for identifying mappings between attributes from infoboxes that come from pages in different languages. Our approach finds mappings in a completely automated fashion. 
Because it does not require training data, it is scalable: not only can it be used to find mappings between many language pairs, but it is also effective for languages that are under-represented and lack sufficient training samples. Another important benefit of our approach is that it does not depend on syntactic similarity between attribute names, and thus, it can be applied to language pairs that have distinct morphologies. We have performed an extensive experimental evaluation using a corpus consisting of pages in Portuguese, Vietnamese, and English. The results show that not only does our approach obtain high precision and recall, but it also outperforms state-of-the-art techniques. We also present a case study which demonstrates that the multilingual mappings we derive lead to substantial improvements in answer quality and coverage for structured queries over Wikipedia content.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2011:CFP, author = "Guimei Liu and Haojun Zhang and Limsoon Wong", title = "Controlling false positives in association rule mining", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "2", pages = "145--156", month = oct, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:08 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Association rule mining is an important problem in the data mining area. It enumerates and tests a large number of rules on a dataset and outputs rules that satisfy user-specified constraints. Due to the large number of rules being tested, rules that do not represent real systematic effect in the data can satisfy the given constraints purely by random chance. Hence association rule mining often suffers from a high risk of false positive errors. 
There is a lack of comprehensive study on controlling false positives in association rule mining. In this paper, we adopt three multiple testing correction approaches---the direct adjustment approach, the permutation-based approach and the holdout approach---to control false positives in association rule mining, and conduct extensive experiments to study their performance. Our results show that (1) Numerous spurious rules are generated if no correction is made. (2) The three approaches can control false positives effectively. Among the three approaches, the permutation-based approach has the highest power of detecting real association rules, but it is very computationally expensive. We employ several techniques to reduce its cost effectively.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Suchanek:2011:PPA, author = "Fabian M. Suchanek and Serge Abiteboul and Pierre Senellart", title = "{PARIS}: probabilistic alignment of relations, instances, and schema", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "157--168", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "One of the main challenges that the Semantic Web faces is the integration of a growing number of independently designed ontologies. In this work, we present PARIS, an approach for the automatic alignment of ontologies. PARIS aligns not only instances, but also relations and classes. Alignments at the instance level cross-fertilize with alignments at the schema level. Thereby, our system provides a truly holistic solution to the problem of ontology alignment. The heart of the approach is probabilistic, i.e., we measure degrees of matchings based on probability estimates.
This allows PARIS to run without any parameter tuning. We demonstrate the efficiency of the algorithm and its precision through extensive experiments. In particular, we obtain a precision of around 90\% in experiments with some of the world's largest ontologies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ranu:2011:ATQ, author = "Sayan Ranu and Ambuj K. Singh", title = "Answering top-$k$ queries over a mixture of attractive and repulsive dimensions", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "169--180", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we formulate a top-$k$ query that compares objects in a database to a user-provided query object on a novel scoring function. The proposed scoring function combines the idea of attractive and repulsive dimensions into a general framework to overcome the weakness of traditional distance or similarity measures. We study the properties of the proposed class of scoring functions and develop efficient and scalable index structures that index the isolines of the function. We demonstrate various scenarios where the query finds application. Empirical evaluation demonstrates a performance gain of one to two orders of magnitude on querying time over existing state-of-the-art top-$k$ techniques. Further, a qualitative analysis is performed on a real dataset to highlight the potential of the proposed query in discovering hidden data characteristics.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Armbrust:2011:PST, author = "Michael Armbrust and Kristal Curtis and Tim Kraska and Armando Fox and Michael J. Franklin and David A. Patterson", title = "{PIQL}: success-tolerant query processing in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "181--192", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Newly-released web applications often succumb to a ``Success Disaster,'' where overloaded database machines and resulting high response times destroy a previously good user experience. Unfortunately, the data independence provided by a traditional relational database system, while useful for agile development, only exacerbates the problem by hiding potentially expensive queries under simple declarative expressions. As a result, developers of these applications are increasingly abandoning relational databases in favor of imperative code written against distributed key/value stores, losing the many benefits of data independence in the process. Instead, we propose PIQL, a declarative language that also provides scale independence by calculating an upper bound on the number of key/value store operations that will be performed for any query. Coupled with a service level objective (SLO) compliance prediction model and PIQL's scalable database architecture, these bounds make it easy for developers to write success-tolerant applications that support an arbitrarily large number of users while still providing acceptable performance. 
In this paper, we present the PIQL query processing system and evaluate its scale independence on hundreds of machines using two benchmarks, TPC-W and SCADr.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2011:GQE, author = "Peixiang Zhao and Charu C. Aggarwal and Min Wang", title = "{gSketch}: on query estimation in graph streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "193--204", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many dynamic applications are built upon large network infrastructures, such as social networks, communication networks, biological networks and the Web. Such applications create data that can be naturally modeled as graph streams, in which edges of the underlying graph are received and updated sequentially in a form of a stream. It is often necessary and important to summarize the behavior of graph streams in order to enable effective query processing. However, the sheer size and dynamic nature of graph streams present an enormous challenge to existing graph management techniques. In this paper, we propose a new graph sketch method, gSketch, which combines well studied synopses for traditional data streams with a sketch partitioning technique, to estimate and optimize the responses to basic queries on graph streams. We consider two different scenarios for query estimation: (1) A graph stream sample is available; (2) Both a graph stream sample and a query workload sample are available. Algorithms for different scenarios are designed respectively by partitioning a global sketch to a group of localized sketches in order to optimize the query estimation accuracy. 
We perform extensive experimental studies on both real and synthetic data sets and demonstrate the power and robustness of gSketch in comparison with the state-of-the-art global sketch method.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ruttenberg:2011:IEM, author = "Brian E. Ruttenberg and Ambuj K. Singh", title = "Indexing the earth mover's distance using normal distributions", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "205--216", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Querying uncertain data sets (represented as probability distributions) presents many challenges due to the large amount of data involved and the difficulties comparing uncertainty between distributions. The Earth Mover's Distance (EMD) has increasingly been employed to compare uncertain data due to its ability to effectively capture the differences between two distributions. Computing the EMD entails finding a solution to the transportation problem, which is computationally intensive. In this paper, we propose a new lower bound to the EMD and an index structure to significantly improve the performance of EMD based $K$-nearest neighbor ($K$-NN) queries on uncertain databases. We propose a new lower bound to the EMD that approximates the EMD on a projection vector. Each distribution is projected onto a vector and approximated by a normal distribution, as well as an accompanying error term. We then represent each normal as a point in a Hough transformed space. We then use the concept of stochastic dominance to implement an efficient index structure in the transformed space.
We show that our method significantly decreases $K$-NN query time on uncertain databases. The index structure also scales well with database cardinality. It is well suited for heterogeneous data sets, helping to keep EMD based queries tractable as uncertain data sets become larger and more complex.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qumsiyeh:2011:GER, author = "Rani Qumsiyeh and Maria S. Pera and Yiu-Kai Ng", title = "Generating exact- and ranked partially-matched answers to questions in advertisements", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "217--228", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Taking advantage of the Web, many advertisements (ads for short) websites, which aspire to increase client's transactions and thus profits, offer searching tools which allow users to (i) post keyword queries to capture their information needs or (ii) invoke form-based interfaces to create queries by selecting search options, such as a price range, filled-in entries, check boxes, or drop-down menus. These search mechanisms, however, are inadequate, since they cannot be used to specify a natural-language query with rich syntactic and semantic content, which can only be handled by a question answering (QA) system. Furthermore, existing ads websites are incapable of evaluating arbitrary Boolean queries or retrieving partially-matched answers that might be of interest to the user whenever a user's search yields only a few or no results at all.
In solving these problems, we present a QA system for ads, called CQAds, which (i) allows users to post a natural-language question Q for retrieving relevant ads, if they exist, (ii) identifies ads as answers that partially-match the requested information expressed in Q, if insufficient or no answers to Q can be retrieved, which are ordered using a similarity-ranking approach, and (iii) analyzes incomplete or ambiguous questions to perform the ``best guess'' in retrieving answers that ``best match'' the selection criteria specified in Q. CQAds is also equipped with a Boolean model to evaluate Boolean operators that are either explicitly or implicitly specified in Q, i.e., with or without Boolean operators specified by the users, respectively. CQAds is easy to use, scalable to all ads domains, and more powerful than search tools provided by existing ads websites, since its query-processing strategy retrieves relevant ads of higher quality and quantity. We have verified the accuracy of CQAds in retrieving ads on eight ads domains and compared its ranking strategy with other well-known ranking approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fakas:2011:SOS, author = "Georgios J. Fakas and Zhi Cai and Nikos Mamoulis", title = "Size-$l$ object summaries for relational keyword search", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "229--240", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A previously proposed keyword search paradigm produces, as a query result, a ranked list of Object Summaries (OSs). 
An OS is a tree structure of related tuples that summarizes all data held in a relational database about a particular Data Subject (DS). However, some of these OSs are very large in size and therefore unfriendly to users that initially prefer synoptic information before proceeding to more comprehensive information about a particular DS. In this paper, we investigate the effective and efficient retrieval of concise and informative OSs. We argue that a good size-$l$ OS should be a stand-alone and meaningful synopsis of the most important information about the particular DS. More precisely, we define a size-$l$ OS as a partial OS composed of $l$ important tuples. We propose three algorithms for the efficient generation of size-$l$ OSs (in addition to the optimal approach which requires exponential time). Experimental evaluation on DBLP and TPC-H databases verifies the effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2011:RER, author = "Lujun Fang and Anish Das Sarma and Cong Yu and Philip Bohannon", title = "{REX}: explaining relationships between entity pairs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "241--252", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge bases of entities and relations (either constructed manually or automatically) are behind many real world search engines, including those at Yahoo!, Microsoft, and Google. Those knowledge bases can be viewed as graphs with nodes representing entities and edges representing (primary) relationships, and various studies have been conducted on how to leverage them to answer entity seeking queries.
Meanwhile, in a complementary direction, analyses over the query logs have enabled researchers to identify entity pairs that are statistically correlated. Such entity relationships are then presented to search users through the ``related searches'' feature in modern search engines. However, entity relationships thus discovered can often be ``puzzling'' to the users because why the entities are connected is often indescribable. In this paper, we propose a novel problem called entity relationship explanation, which seeks to explain why a pair of entities are connected, and solve this challenging problem by integrating the above two complementary approaches, i.e., we leverage the knowledge base to ``explain'' the connections discovered between entity pairs. More specifically, we present REX, a system that takes a pair of entities in a given knowledge base as input and efficiently identifies a ranked list of relationship explanations. We formally define relationship explanations and analyze their desirable properties. Furthermore, we design and implement algorithms to efficiently enumerate and rank all relationship explanations based on multiple measures of ``interestingness.'' We perform extensive experiments over real web-scale data gathered from DBpedia and a commercial search engine, demonstrating the efficiency and scalability of REX. We also perform user studies to corroborate the effectiveness of explanations generated by REX.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2011:PJP, author = "Guoliang Li and Dong Deng and Jiannan Wang and Jianhua Feng", title = "{Pass-Join}: a partition-based method for similarity joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "253--264", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As an essential operation in data cleaning, the similarity join has attracted considerable attention from the database community. In this paper, we study string similarity joins with edit-distance constraints, which find similar string pairs from two large sets of strings whose edit distance is within a given threshold. Existing algorithms are efficient either for short strings or for long strings, and there is no algorithm that can efficiently and adaptively support both short strings and long strings. To address this problem, we propose a partition-based method called Pass-Join. Pass-Join partitions a string into a set of segments and creates inverted indices for the segments. Then for each string, Pass-Join selects some of its substrings and uses the selected substrings to find candidate pairs using the inverted indices. We devise efficient techniques to select the substrings and prove that our method can minimize the number of selected substrings. We develop novel pruning techniques to efficiently verify the candidate pairs. Experimental results show that our algorithms are efficient for both short strings and long strings, and outperform state-of-the-art methods on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hoobin:2011:RLZ, author = "Christopher Hoobin and Simon J. Puglisi and Justin Zobel", title = "Relative {Lempel--Ziv} factorization for efficient storage and retrieval of {Web} collections", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "3", pages = "265--273", month = nov, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:09 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Compression techniques that support fast random access are a core component of any information system. Current state-of-the-art methods group documents into fixed-sized blocks and compress each block with a general-purpose adaptive algorithm such as gzip. Random access to a specific document then requires decompression of a block. The choice of block size is critical: it trades between compression effectiveness and document retrieval times. In this paper we present a scalable compression method for large document collections that allows fast random access. We build a representative sample of the collection and use it as a dictionary in a LZ77-like encoding of the rest of the collection, relative to the dictionary. We demonstrate on large collections, that using a dictionary as small as 0.1\% of the collection size, our algorithm is dramatically faster than previous methods, and in general gives much better compression.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2011:TCE, author = "Ning Zhang and Junichi Tatemura and Jignesh M. 
Patel and Hakan Hacig{\"u}m{\"u}s", title = "Towards cost-effective storage provisioning for {DBMSs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "274--285", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data center operators face a bewildering set of choices when considering how to provision resources on machines with complex I/O subsystems. Modern I/O subsystems often have a rich mix of fast, high performing, but expensive SSDs sitting alongside with cheaper but relatively slower (for random accesses) traditional hard disk drives. The data center operators need to determine how to provision the I/O resources for specific workloads so as to abide by existing Service Level Agreements (SLAs), while minimizing the total operating cost (TOC) of running the workload, where the TOC includes the amortized hardware costs and the run time energy costs. The focus of this paper is on introducing this new problem of TOC-based storage allocation, cast in a framework that is compatible with traditional DBMS query optimization and query processing architecture. We also present a heuristic-based solution to this problem, called DOT. We have implemented DOT in PostgreSQL, and experiments using TPC-H and TPC-C demonstrate significant TOC reduction by DOT in various settings.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Roh:2011:BTI, author = "Hongchan Roh and Sanghyun Park and Sungho Kim and Mincheol Shin and Sang-Won Lee", title = "{B+}-tree index optimization by exploiting internal parallelism of flash-based solid state drives", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "286--297", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Previous research addressed the potential problems of the hard-disk oriented design of DBMSs of flashSSDs. In this paper, we focus on exploiting potential benefits of flashSSDs. First, we examine the internal parallelism issues of flashSSDs by conducting benchmarks to various flashSSDs. Then, we suggest algorithm-design principles in order to best benefit from the internal parallelism. We present a new I/O request concept, called psync I/O that can exploit the internal parallelism of flashSSDs in a single process. Based on these ideas, we introduce B+-tree optimization methods in order to utilize internal parallelism. By integrating the results of these methods, we present a B+-tree variant, PIO B-tree. We confirmed that each optimization method substantially enhances the index performance. Consequently, PIO B-tree enhanced B+-tree's insert performance by a factor of up to 16.3, while improving point-search performance by a factor of 1.2. The range search of PIO B-tree was up to 5 times faster than that of the B+-tree. Moreover, PIO B-tree outperformed other flash-aware indexes in various synthetic workloads. We also confirmed that PIO B-tree outperforms B+-tree in index traces collected inside the PostgreSQL DBMS with TPC-C benchmark.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Larson:2011:HPC, author = "Per-{\AA}ke Larson and Spyros Blanas and Cristian Diaconu and Craig Freedman and Jignesh M. Patel and Mike Zwilling", title = "High-performance concurrency control mechanisms for main-memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "298--309", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A database system optimized for in-memory storage can support much higher transaction rates than current systems. However, standard concurrency control methods used today do not scale to the high transaction rates achievable by such systems. In this paper we introduce two efficient concurrency control methods specifically designed for main-memory databases. Both use multiversioning to isolate read-only transactions from updates but differ in how atomicity is ensured: one is optimistic and one is pessimistic. To avoid expensive context switching, transactions never block during normal processing but they may have to wait before commit to ensure correct serialization ordering. We also implemented a main-memory optimized version of single-version locking. Experimental results show that while single-version locking works well when transactions are short and contention is low performance degrades under more demanding conditions. The multiversion schemes have higher overhead but are much less sensitive to hotspots and the presence of long-running transactions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ma:2011:CTG, author = "Shuai Ma and Yang Cao and Wenfei Fan and Jinpeng Huai and Tianyu Wo", title = "Capturing topology in graph pattern matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "310--321", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph pattern matching is often defined in terms of subgraph isomorphism, an NP-complete problem. To lower its complexity, various extensions of graph simulation have been considered instead. These extensions allow pattern matching to be conducted in cubic-time. However, they fall short of capturing the topology of data graphs, i.e., graphs may have a structure drastically different from pattern graphs they match, and the matches found are often too large to understand and analyze. To rectify these problems, this paper proposes a notion of strong simulation, a revision of graph simulation, for graph pattern matching. (1) We identify a set of criteria for preserving the topology of graphs matched. We show that strong simulation preserves the topology of data graphs and finds a bounded number of matches. (2) We show that strong simulation retains the same complexity as earlier extensions of simulation, by providing a cubic-time algorithm for computing strong simulation. (3) We present the locality property of strong simulation, which allows us to effectively conduct pattern matching on distributed graphs. (4) We experimentally verify the effectiveness and efficiency of these algorithms, using real-life data and synthetic data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kumar:2011:PMO, author = "Arun Kumar and Christopher R{\'e}", title = "Probabilistic management of {OCR} data using an {RDBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "322--333", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The digitization of scanned forms and documents is changing the data sources that enterprises manage. To integrate these new data sources with enterprise data, the current state-of-the-art approach is to convert the images to ASCII text using optical character recognition (OCR) software and then to store the resulting ASCII text in a relational database. The OCR problem is challenging, and so the output of OCR often contains errors. In turn, queries on the output of OCR may fail to retrieve relevant answers. State-of-the-art OCR programs, e.g., the OCR powering Google Books, use a probabilistic model that captures many alternatives during the OCR process. Only when the results of OCR are stored in the database, do these approaches discard the uncertainty. In this work, we propose to retain the probabilistic models produced by OCR process in a relational database management system. A key technical challenge is that the probabilistic data produced by OCR software is very large (a single book blows up to 2GB from 400kB as ASCII). As a result, a baseline solution that integrates these models with an RDBMS is over 1000x slower versus standard text processing for single table select-project queries. However, many applications may have quality-performance needs that are in between these two extremes of ASCII and the complete model output by the OCR software. 
Thus, we propose a novel approximation scheme called Staccato that allows a user to trade recall for query performance. Additionally, we provide a formal analysis of our scheme's properties, and describe how we integrate our scheme with standard-RDBMS text indexing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pawlik:2011:RRA, author = "Mateusz Pawlik and Nikolaus Augsten", title = "{RTED}: a robust algorithm for the tree edit distance", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "334--345", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the classical tree edit distance between ordered labeled trees, which is defined as the minimum-cost sequence of node edit operations that transform one tree into another. The state-of-the-art solutions for the tree edit distance are not satisfactory. The main competitors in the field either have optimal worst-case complexity, but the worst case happens frequently, or they are very efficient for some tree shapes, but degenerate for others. This leads to unpredictable and often infeasible runtimes. There is no obvious way to choose between the algorithms. In this paper we present RTED, a robust tree edit distance algorithm. The asymptotic complexity of RTED is smaller or equal to the complexity of the best competitors for any input instance, i.e., RTED is both efficient and worst-case optimal. We introduce the class of LRH (Left-Right-Heavy) algorithms, which includes RTED and the fastest tree edit distance algorithms presented in literature. We prove that RTED outperforms all previously proposed LRH algorithms in terms of runtime complexity. 
In our experiments on synthetic and real world data we empirically evaluate our solution and compare it to the state-of-the-art.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amsterdamer:2011:PLP, author = "Yael Amsterdamer and Susan B. Davidson and Daniel Deutch and Tova Milo and Julia Stoyanovich and Val Tannen", title = "Putting {Lipstick} on {Pig}: enabling database-style workflow provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "346--357", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Workflow provenance typically assumes that each module is a ``black-box'', so that each output depends on all inputs (coarse-grained dependencies). Furthermore, it does not model the internal state of a module, which can change between repeated executions. In practice, however, an output may depend on only a small subset of the inputs (fine-grained dependencies) as well as on the internal state of the module. We present a novel provenance framework that marries database-style and workflow-style provenance, by using Pig Latin to expose the functionality of modules, thus capturing internal state and fine-grained dependencies. A critical ingredient in our solution is the use of a novel form of provenance graph that models module invocations and yields a compact representation of fine-grained workflow provenance. It also enables a number of novel graph transformation operations, allowing to choose the desired level of granularity in provenance querying (ZoomIn and ZoomOut), and supporting ``what-if'' workflow analytic queries. 
We implemented our approach in the Lipstick system and developed a benchmark in support of a systematic performance evaluation. Our results demonstrate the feasibility of tracking and querying fine-grained workflow provenance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2011:RAS, author = "Jun Gao and Ruoming Jin and Jiashuai Zhou and Jeffrey Xu Yu and Xiao Jiang and Tengjiao Wang", title = "Relational approach for shortest path discovery over large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "358--369", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the rapid growth of large graphs, we cannot assume that graphs can still be fully loaded into memory, thus the disk-based graph operation is inevitable. In this paper, we take the shortest path discovery as an example to investigate the technique issues when leveraging existing infrastructure of relational database (RDB) in the graph data management. Based on the observation that a variety of graph search queries can be implemented by iterative operations including selecting frontier nodes from visited nodes, making expansion from the selected frontier nodes, and merging the expanded nodes into the visited ones, we introduce a relational FEM framework with three corresponding operators to implement graph search tasks in the RDB context. We show new features such as window function and merge statement introduced by recent SQL standards can not only simplify the expression but also improve the performance of the FEM framework. In addition, we propose two optimization strategies specific to shortest path discovery inside the FEM framework. 
First, we take a bi-directional set Dijkstra's algorithm in the path finding. The bi-directional strategy can reduce the search space, and set Dijkstra's algorithm finds the shortest path in a set-at-a-time fashion. Second, we introduce an index named SegTable to preserve the local shortest segments, and exploit SegTable to further improve the performance. The final extensive experimental results illustrate our relational approach with the optimization strategies achieves high scalability and performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Barsky:2011:MFC, author = "Marina Barsky and Sangkyum Kim and Tim Weninger and Jiawei Han", title = "Mining flipping correlations from large datasets with taxonomies", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "370--381", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we introduce a new type of pattern --- a flipping correlation pattern. The flipping patterns are obtained from contrasting the correlations between items at different levels of abstraction. They represent surprising correlations, both positive and negative, which are specific for a given abstraction level, and which ``flip'' from positive to negative and vice versa when items are generalized to a higher level of abstraction. We design an efficient algorithm for finding flipping correlations, the Flipper algorithm, which outperforms na{\"\i}ve pattern mining methods by several orders of magnitude. We apply Flipper to real-life datasets and show that the discovered patterns are non-redundant, surprising and actionable. 
Flipper finds strong contrasting correlations in itemsets with low-to-medium support, while existing techniques cannot handle the pattern discovery in this frequency range.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Konig:2011:SAT, author = "Arnd Christian K{\"o}nig and Bolin Ding and Surajit Chaudhuri and Vivek Narasayya", title = "A statistical approach towards robust progress estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "4", pages = "382--393", month = dec, year = "2011", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:11 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The need for accurate SQL progress estimation in the context of decision support administration has led to a number of techniques proposed for this task. Unfortunately, no single one of these progress estimators behaves robustly across the variety of SQL queries encountered in practice, meaning that each technique performs poorly for a significant fraction of queries. This paper proposes a novel estimator selection framework that uses a statistical model to characterize the sets of conditions under which certain estimators outperform others, leading to a significant increase in estimation robustness. The generality of this framework also enables us to add a number of novel ``special purpose'' estimators which increase accuracy further. Most importantly, the resulting model generalizes well to queries very different from the ones used to train it. We validate our findings using a large number of industrial real-life and benchmark workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2012:RSA, author = "Yizhou Sun and Charu C. Aggarwal and Jiawei Han", title = "Relation strength-aware clustering of heterogeneous information networks with incomplete attributes", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "394--405", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the rapid development of online social media, online shopping sites and cyber-physical systems, heterogeneous information networks have become increasingly popular and content-rich over time. In many cases, such networks contain multiple types of objects and links, as well as different kinds of attributes. The clustering of these objects can provide useful insights in many applications. However, the clustering of such networks can be challenging since (a) the attribute values of objects are often incomplete, which implies that an object may carry only partial attributes or even no attributes to correctly label itself; and (b) the links of different types may carry different kinds of semantic meanings, and it is a difficult task to determine the nature of their relative importance in helping the clustering for a given purpose. In this paper, we address these challenges by proposing a model-based clustering algorithm. We design a probabilistic model which clusters the objects of different types into a common hidden space, by using a user-specified set of attributes, as well as the links from different relations. The strengths of different types of links are automatically learned, and are determined by the given purpose of clustering. 
An iterative algorithm is designed for solving the clustering problem, in which the strengths of different types of links and the quality of clustering results mutually enhance each other. Our experimental results on real and synthetic data sets demonstrate the effectiveness and efficiency of the algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2012:SPD, author = "Lingkun Wu and Xiaokui Xiao and Dingxiong Deng and Gao Cong and Andy Diwen Zhu and Shuigeng Zhou", title = "Shortest path and distance queries on road networks: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "406--417", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computing the shortest path between two given locations in a road network is an important problem that finds applications in various map services and commercial navigation products. The state-of-the-art solutions for the problem can be divided into two categories: spatial-coherence-based methods and vertex-importance-based approaches. The two categories of techniques, however, have not been compared systematically under the same experimental framework, as they were developed from two independent lines of research that do not refer to each other. This renders it difficult for a practitioner to decide which technique should be adopted for a specific application. Furthermore, the experimental evaluation of the existing techniques, as presented in previous work, falls short in several aspects. 
Some methods were tested only on small road networks with up to one hundred thousand vertices; some approaches were evaluated using distance queries (instead of shortest path queries), namely, queries that ask only for the length of the shortest path; a state-of-the-art technique was examined based on a faulty implementation that led to incorrect query results. To address the above issues, this paper presents a comprehensive comparison of the most advanced spatial-coherence-based and vertex-importance-based approaches. Using a variety of real road networks with up to twenty million vertices, we evaluated each technique in terms of its preprocessing time, space consumption, and query efficiency (for both shortest path and distance queries). Our experimental results reveal the characteristics of different techniques, based on which we provide guidelines on selecting appropriate methods for various scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Erdos:2012:FPP, author = "D{\'o}ra Erd{\H{o}}s and Vatche Ishakian and Andrei Lapets and Evimaria Terzi and Azer Bestavros", title = "The filter-placement problem and its application to minimizing information multiplicity", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "418--429", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many information networks, data items --- such as updates in social networks, news flowing through interconnected RSS feeds and blogs, measurements in sensor networks, route updates in ad-hoc networks --- propagate in an uncoordinated manner: nodes often relay information they receive to neighbors, independent of whether or not these neighbors received the 
same information from other sources. This uncoordinated data dissemination may result in significant, yet unnecessary communication and processing overheads, ultimately reducing the utility of information networks. To alleviate the negative impacts of this information multiplicity phenomenon, we propose that a subset of nodes (selected at key positions in the network) carry out additional information filtering functionality. Thus, nodes are responsible for the removal (or significant reduction) of the redundant data items relayed through them. We refer to such nodes as filters. We formally define the Filter Placement problem as a combinatorial optimization problem, and study its computational complexity for different types of graphs. We also present polynomial-time approximation algorithms and scalable heuristics for the problem. Our experimental results, which we obtained through extensive simulations on synthetic and real-world information flow networks, suggest that in many settings a relatively small number of filters are fairly effective in removing a large fraction of redundant information.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Satuluri:2012:BLS, author = "Venu Satuluri and Srinivasan Parthasarathy", title = "{Bayesian} locality sensitive hashing for fast similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "430--441", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a collection of objects and an associated similarity measure, the all-pairs similarity search problem asks us to find all pairs of objects with similarity greater than a certain user-specified threshold. 
Locality-sensitive hashing (LSH) based methods have become a very popular approach for this problem. However, most such methods only use LSH for the first phase of similarity search --- i.e. efficient indexing for candidate generation. In this paper, we present BayesLSH, a principled Bayesian algorithm for the subsequent phase of similarity search --- performing candidate pruning and similarity estimation using LSH. A simpler variant, BayesLSH-Lite, which calculates similarities exactly, is also presented. Our algorithms are able to quickly prune away a large majority of the false positive candidate pairs, leading to significant speedups over baseline approaches. For BayesLSH, we also provide probabilistic guarantees on the quality of the output, both in terms of accuracy and recall. Finally, the quality of BayesLSH's output can be easily tuned and does not require any manual setting of the number of hashes to use for similarity estimation, unlike standard approaches. For two state-of-the-art candidate generation algorithms, AllPairs and LSH, BayesLSH enables significant speedups, typically in the range 2x-20x for a wide variety of datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fujiwara:2012:FET, author = "Yasuhiro Fujiwara and Makoto Nakatsuji and Makoto Onizuka and Masaru Kitsuregawa", title = "Fast and exact top-$k$ search for random walk with restart", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "442--453", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs are fundamental data structures and have been employed for centuries to model real-world systems and phenomena. 
Random walk with restart (RWR) provides a good proximity score between two nodes in a graph, and it has been successfully used in many applications such as automatic image captioning, recommender systems, and link prediction. The goal of this work is to find nodes that have top-$k$ highest proximities for a given node. Previous approaches to this problem find nodes efficiently at the expense of exactness. The main motivation of this paper is to answer, in the affirmative, the question, ``Is it possible to improve the search time without sacrificing the exactness?''. Our solution, K-dash, is based on two ideas: (1) It computes the proximity of a selected node efficiently by sparse matrices, and (2) It skips unnecessary proximity computations when searching for the top-$k$ nodes. Theoretical analyses show that K-dash guarantees result exactness. We perform comprehensive experiments to verify the efficiency of K-dash. The results show that K-dash can find top-$k$ nodes significantly faster than the previous approaches while it guarantees exactness.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bahmani:2012:DSS, author = "Bahman Bahmani and Ravi Kumar and Sergei Vassilvitskii", title = "Densest subgraph in streaming and {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "454--465", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of finding locally dense components of a graph is an important primitive in data analysis, with wide-ranging applications from community mining to spam detection and the discovery of biological network modules. 
In this paper we present new algorithms for finding the densest subgraph in the streaming model. For any $ \epsilon > 0 $, our algorithms make $ O(\log_{1 + \epsilon } n) $ passes over the input and find a subgraph whose density is guaranteed to be within a factor $ 2 (1 + \epsilon) $ of the optimum. Our algorithms are also easily parallelizable and we illustrate this by realizing them in the MapReduce model. In addition we perform extensive experimental evaluation on massive real-world graphs showing the performance and scalability of our algorithms in practice.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Silva:2012:MAS, author = "Arlei Silva and Wagner {Meira, Jr.} and Mohammed J. Zaki", title = "Mining attribute-structure correlated patterns in large attributed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "466--477", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we study the correlation between attribute sets and the occurrence of dense subgraphs in large attributed graphs, a task we call structural correlation pattern mining. A structural correlation pattern is a dense subgraph induced by a particular attribute set. Existing methods are not able to extract relevant knowledge regarding how vertex attributes interact with dense subgraphs. Structural correlation pattern mining combines aspects of frequent itemset and quasi-clique mining problems. We propose statistical significance measures that compare the structural correlation of attribute sets against their expected values using null models. Moreover, we evaluate the interestingness of structural correlation patterns in terms of size and density. 
An efficient algorithm that combines search and pruning strategies in the identification of the most relevant structural correlation patterns is presented. We apply our method for the analysis of three real-world attributed graphs: a collaboration, a music, and a citation network, verifying that it provides valuable knowledge in a feasible time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schnaitter:2012:SAI, author = "Karl Schnaitter and Neoklis Polyzotis", title = "Semi-automatic index tuning: keeping {DBAs} in the loop", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "478--489", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To obtain a high level of system performance, a database administrator (DBA) must choose a set of indices that is appropriate for the workload. The system can aid in this challenging task by providing recommendations for the index configuration. We propose a new index recommendation technique, termed semi-automatic tuning, that keeps the DBA ``in the loop'' by generating recommendations that use feedback about the DBA's preferences. The technique also works online, which avoids the limitations of commercial tools that require the workload to be known in advance. The foundation of our approach is the Work Function Algorithm, which can solve a wide variety of online optimization problems with strong competitive guarantees. We present an experimental analysis that validates the benefits of semi-automatic tuning in a wide variety of conditions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fink:2012:APD, author = "Robert Fink and Larisa Han and Dan Olteanu", title = "Aggregation in probabilistic databases via knowledge compilation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "5", pages = "490--501", month = jan, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:13 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a query evaluation technique for positive relational algebra queries with aggregates on a representation system for probabilistic data based on the algebraic structures of semiring and semimodule. The core of our evaluation technique is a procedure that compiles semimodule and semiring expressions into so-called decomposition trees, for which the computation of the probability distribution can be done in time linear in the product of the sizes of the probability distributions represented by its nodes. We give syntactic characterisations of tractable queries with aggregates by exploiting the connection between query tractability and polynomial-time decomposition trees. A prototype of the technique is incorporated in the probabilistic database engine SPROUT. We report on performance experiments with custom datasets and TPC-H data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Halim:2012:SDC, author = "Felix Halim and Stratos Idreos and Panagiotis Karras and Roland H. C. 
Yap", title = "Stochastic database cracking: towards robust adaptive indexing in main-memory column-stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "502--513", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern business applications and scientific databases call for inherently dynamic data storage environments. Such environments are characterized by two challenging features: (a) they have little idle system time to devote on physical design; and (b) there is little, if any, a priori workload knowledge, while the query and data workload keeps changing dynamically. In such environments, traditional approaches to index building and maintenance cannot apply. Database cracking has been proposed as a solution that allows on-the-fly physical data reorganization, as a collateral effect of query processing. Cracking aims to continuously and automatically adapt indexes to the workload at hand, without human intervention. Indexes are built incrementally, adaptively, and on demand. Nevertheless, as we show, existing adaptive indexing methods fail to deliver workload-robustness; they perform much better with random workloads than with others. This frailty derives from the inelasticity with which these approaches interpret each query as a hint on how data should be stored. Current cracking schemes blindly reorganize the data within each query's range, even if that results into successive expensive operations with minimal indexing benefit. In this paper, we introduce stochastic cracking, a significantly more resilient approach to adaptive indexing. 
Stochastic cracking also uses each query as a hint on how to reorganize data, but not blindly so; it gains resilience and avoids performance bottlenecks by deliberately applying certain arbitrary choices in its decision-making. Thereby, we bring adaptive indexing forward to a mature formulation that confers the workload-robustness previous approaches lacked. Our extensive experimental study verifies that stochastic cracking maintains the desired properties of original database cracking while at the same time it performs well with diverse realistic workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2012:AMA, author = "Chao Li and Gerome Miklau", title = "An adaptive mechanism for accurate query answering under differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "514--525", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a novel mechanism for answering sets of counting queries under differential privacy. Given a workload of counting queries, the mechanism automatically selects a different set of ``strategy'' queries to answer privately, using those answers to derive answers to the workload. The main algorithm proposed in this paper approximates the optimal strategy for any workload of linear counting queries. With no cost to the privacy guarantee, the mechanism improves significantly on prior approaches and achieves near-optimal error for many workloads, when applied under $ (\epsilon, \delta)$-differential privacy. 
The result is an adaptive mechanism which can help users achieve good utility without requiring that they reason carefully about the best formulation of their task.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Giannikis:2012:SKO, author = "Georgios Giannikis and Gustavo Alonso and Donald Kossmann", title = "{SharedDB}: killing one thousand queries with one stone", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "526--537", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional database systems are built around the query-at-a-time model. This approach tries to optimize performance in a best-effort way. Unfortunately, best effort is not good enough for many modern applications. These applications require response time guarantees in high load situations. This paper describes the design of a new database architecture that is based on batching queries and shared computation across possibly hundreds of concurrent queries and updates. Performance experiments with the TPC-W benchmark show that the performance of our implementation, SharedDB, is indeed robust across a wide range of dynamic workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Selke:2012:PBC, author = "Joachim Selke and Christoph Lofi and Wolf-Tilo Balke", title = "Pushing the boundaries of crowd-enabled databases with query-driven schema expansion", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "538--549", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "By incorporating human workers into the query execution process crowd-enabled databases facilitate intelligent, social capabilities like completing missing data at query time or performing cognitive operators. But despite all their flexibility, crowd-enabled databases still maintain rigid schemas. In this paper, we extend crowd-enabled databases by flexible query-driven schema expansion, allowing the addition of new attributes to the database at query time. However, the number of crowd-sourced mini-tasks to fill in missing values may often be prohibitively large and the resulting data quality is doubtful. Instead of simple crowd-sourcing to obtain all values individually, we leverage the user-generated data found in the Social Web: By exploiting user ratings we build perceptual spaces, i.e., highly-compressed representations of opinions, impressions, and perceptions of large numbers of users. Using few training samples obtained by expert crowd sourcing, we then can extract all missing data automatically from the perceptual space with high quality and at low costs. Extensive experiments show that our approach can boost both performance and quality of crowd-enabled databases, while also providing the flexibility to expand schemas in a query-driven fashion.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2012:BAD, author = "Bo Zhao and Benjamin I. P. Rubinstein and Jim Gemmell and Jiawei Han", title = "A {Bayesian} approach to discovering truth from conflicting sources for data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "550--561", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In practical data integration systems, it is common for the data sources being integrated to provide conflicting information about the same entity. Consequently, a major challenge for data integration is to derive the most complete and accurate integrated records from diverse and sometimes conflicting sources. We term this challenge the truth finding problem. We observe that some sources are generally more reliable than others, and therefore a good model of source quality is the key to solving the truth finding problem. In this work, we propose a probabilistic graphical model that can automatically infer true records and source quality without any supervision. In contrast to previous methods, our principled approach leverages a generative process of two types of errors (false positive and false negative) by modeling two different aspects of source quality. In so doing, ours is also the first approach designed to merge multi-valued attribute types. Our method is scalable, due to an efficient sampling-based inference algorithm that needs very few iterations in practice and enjoys linear time complexity, with an even faster incremental variant. Experiments on two real world datasets show that our new method outperforms existing state-of-the-art approaches to the truth finding problem.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Upadhyaya:2012:HPS, author = "Prasang Upadhyaya and Magdalena Balazinska and Dan Suciu", title = "How to price shared optimizations in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "562--573", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-management-as-a-service systems are increasingly being used in collaborative settings, where multiple users access common datasets. Cloud providers have the choice to implement various optimizations, such as indexing or materialized views, to accelerate queries over these datasets. Each optimization carries a cost and may benefit multiple users. This creates a major challenge: how to select which optimizations to perform and how to share their cost among users. The problem is especially challenging when users are selfish and will only report their true values for different optimizations if doing so maximizes their utility. In this paper, we present a new approach for selecting and pricing shared optimizations by using Mechanism Design. We first show how to apply the Shapley Value Mechanism to the simple case of selecting and pricing additive optimizations, assuming an offline game where all users access the service for the same time-period. Second, we extend the approach to online scenarios where users come and go. Finally, we consider the case of substitutive optimizations. We show analytically that our mechanisms induce truthfulness and recover the optimization costs. We also show experimentally that our mechanisms yield higher utility than the state-of-the-art approach based on regret accumulation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Angel:2012:DSM, author = "Albert Angel and Nikos Sarkas and Nick Koudas and Divesh Srivastava", title = "Dense subgraph maintenance under streaming edge weight updates for real-time story identification", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "574--585", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent years have witnessed an unprecedented proliferation of social media. People around the globe author, every day, millions of blog posts, micro-blog posts, social network status updates, etc. This rich stream of information can be used to identify, on an ongoing basis, emerging stories, and events that capture popular attention. Stories can be identified via groups of tightly-coupled real-world entities, namely the people, locations, products, etc., that are involved in the story. The sheer scale, and rapid evolution of the data involved necessitate highly efficient techniques for identifying important stories at every point of time. The main challenge in real-time story identification is the maintenance of dense subgraphs (corresponding to groups of tightly-coupled entities) under streaming edge weight updates (resulting from a stream of user-generated content). This is the first work to study the efficient maintenance of dense subgraphs under such streaming edge weight updates. For a wide range of definitions of density, we derive theoretical results regarding the magnitude of change that a single edge weight update can cause. Based on these, we propose a novel algorithm, DynDens, which outperforms adaptations of existing techniques to this setting, and yields meaningful results. 
Our approach is validated by a thorough experimental evaluation on large-scale real and synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elghandour:2012:RRR, author = "Iman Elghandour and Ashraf Aboulnaga", title = "{ReStore}: reusing results of {MapReduce} jobs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "6", pages = "586--597", month = feb, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 24 07:52:15 MDT 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analyzing large scale data has emerged as an important activity for many organizations in the past few years. This large scale data analysis is facilitated by the MapReduce programming and execution model and its implementations, most notably Hadoop. Users of MapReduce often have analysis tasks that are too complex to express as individual MapReduce jobs. Instead, they use high-level query languages such as Pig, Hive, or Jaql to express their complex tasks. The compilers of these languages translate queries into workflows of MapReduce jobs. Each job in these workflows reads its input from the distributed file system used by the MapReduce system and produces output that is stored in this distributed file system and read as input by the next job in the workflow. The current practice is to delete these intermediate results from the distributed file system at the end of executing the workflow. One way to improve the performance of workflows of MapReduce jobs is to keep these intermediate results and reuse them for future workflows submitted to the system. In this paper, we present ReStore, a system that manages the storage and reuse of such intermediate results. 
ReStore can reuse the output of whole MapReduce jobs that are part of a workflow, and it can also create additional reuse opportunities by materializing and storing the output of query execution operators that are executed within a MapReduce job. We have implemented ReStore as an extension to the Pig dataflow system on top of Hadoop, and we experimentally demonstrate significant speedups on queries from the PigMix benchmark.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khoussainova:2012:PDM, author = "Nodira Khoussainova and Magdalena Balazinska and Dan Suciu", title = "{PerfXplain}: debugging {MapReduce} job performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "598--609", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While users today have access to many tools that assist in performing large scale data analysis tasks, understanding the performance characteristics of their parallel computations, such as MapReduce jobs, remains difficult. We present PerfXplain, a system that enables users to ask questions about the relative performances (i.e., runtimes) of pairs of MapReduce jobs. PerfXplain provides a new query language for articulating performance queries and an algorithm for generating explanations from a log of past MapReduce job executions. We formally define the notion of an explanation together with three metrics, relevance, precision, and generality, that measure explanation quality. We present the explanation-generation algorithm based on techniques related to decision-tree building. 
We evaluate the approach on a log of past executions on Amazon EC2, and show that our approach can generate quality explanations, outperforming two na{\"\i}ve explanation-generation methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gullo:2012:UCB, author = "Francesco Gullo and Andrea Tagarelli", title = "Uncertain centroid based partitional clustering of uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "610--621", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Clustering uncertain data has emerged as a challenging task in uncertain data management and mining. Thanks to a computational complexity advantage over other clustering paradigms, partitional clustering has been particularly studied and a number of algorithms have been developed. While existing proposals differ mainly in the notions of cluster centroid and clustering objective function, little attention has been given to an analysis of their characteristics and limits. In this work, we theoretically investigate major existing methods of partitional clustering, and alternatively propose a well-founded approach to clustering uncertain data based on a novel notion of cluster centroid. A cluster centroid is seen as an uncertain object defined in terms of a random variable whose realizations are derived based on all deterministic representations of the objects to be clustered. 
As demonstrated theoretically and experimentally, this allows for better representing a cluster of uncertain objects, thus supporting a consistently improved clustering performance while maintaining comparable efficiency with existing partitional clustering algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bahmani:2012:SM, author = "Bahman Bahmani and Benjamin Moseley and Andrea Vattani and Ravi Kumar and Sergei Vassilvitskii", title = "Scalable $k$-means++", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "622--633", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over half a century old and showing no signs of aging, $k$-means remains one of the most popular data processing algorithms. As is well-known, a proper initialization of $k$-means is crucial for obtaining a good final solution. The recently proposed $k$-means++ initialization algorithm achieves this, obtaining an initial set of centers that is provably close to the optimum solution. A major downside of the $k$-means++ is its inherent sequential nature, which limits its applicability to massive data: one must make $k$ passes over the data to find a good initial set of centers. In this work we show how to drastically reduce the number of passes needed to obtain, in parallel, a good initialization. This is unlike prevailing efforts on parallelizing $k$-means that have mostly focused on the post-initialization phases of $k$-means. We prove that our proposed initialization algorithm $k$-means|| obtains a nearly optimal solution after a logarithmic number of passes, and then show that in practice a constant number of passes suffices.
Experimental evaluation on real-world large-scale data demonstrates that $k$-means|| outperforms $k$-means++ in both sequential and parallel settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Benedikt:2012:QSA, author = "Michael Benedikt and Pierre Bourhis and Clemens Ley", title = "Querying schemas with access restrictions", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "634--645", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study verification of systems whose transitions consist of accesses to a Web-based data-source. An access is a lookup on a relation within a relational database, fixing values for a set of positions in the relation. For example, a transition can represent access to a Web form, where the user is restricted to filling in values for a particular set of fields. We look at verifying properties of a schema describing the possible accesses of such a system. We present a language where one can describe the properties of an access path, and also specify additional restrictions on accesses that are enforced by the schema. Our main property language, AccLTL, is based on a first-order extension of linear-time temporal logic, interpreting access paths as sequences of relational structures. We also present a lower-level automaton model, A-automata, which AccLTL specifications can compile into. 
We show that AccLTL and A-automata can express static analysis problems related to ``querying with limited access patterns'' that have been studied in the database literature in the past, such as whether an access is relevant to answering a query, and whether two queries are equivalent in the accessible data they can return. We prove decidability and complexity results for several restrictions and variants of AccLTL, and explain which properties of paths can be expressed in each restriction.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Graefe:2012:DDR, author = "Goetz Graefe and Harumi Kuno", title = "Definition, detection, and recovery of single-page failures, a fourth class of database failures", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "646--655", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The three traditional failure classes are system, media, and transaction failures. Sometimes, however, modern storage exhibits failures that differ from all of those. In order to capture and describe such cases, single-page failures are introduced as a fourth failure class. This class encompasses all failures to read a data page correctly and with plausible contents despite all correction attempts in lower system levels. Efficient recovery seems to require a new data structure called the page recovery index. Its transactional maintenance can be accomplished writing the same number of log records as today's efficient implementations of logging and recovery. 
Detection and recovery of a single-page failure can be sufficiently fast that the affected data access is merely delayed, without the need to abort the transaction.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Graefe:2012:CCA, author = "Goetz Graefe and Felix Halim and Stratos Idreos and Harumi Kuno and Stefan Manegold", title = "Concurrency control for adaptive indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "656--667", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Adaptive indexing initializes and optimizes indexes incrementally, as a side effect of query processing. The goal is to achieve the benefits of indexes while hiding or minimizing the costs of index creation. However, index-optimizing side effects seem to turn read-only queries into update transactions that might, for example, create lock contention. This paper studies concurrency control in the context of adaptive indexing. We show that the design and implementation of adaptive indexing rigorously separates index structures from index contents; this relaxes the constraints and requirements during adaptive indexing compared to those of traditional index updates. Our design adapts to the fact that an adaptive index is refined continuously, and exploits any concurrency opportunities in a dynamic way. 
A detailed experimental analysis demonstrates that (a) adaptive indexing maintains its adaptive properties even when running concurrent queries, (b) adaptive indexing can exploit the opportunity for parallelism due to concurrent queries, (c) the number of concurrency conflicts and any concurrency administration overheads follow an adaptive behavior, decreasing as the workload evolves and adapting to the workload needs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2012:CSB, author = "Qiang Zeng and Hai Zhuge", title = "Comments on {``Stack-based Algorithms for Pattern Matching on DAGs''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "668--679", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The paper ``Stack-based Algorithms for Pattern Matching on DAGs'' generalizes the classical holistic twig join algorithms and proposes PathStackD, TwigStackD and DagStackD to respectively evaluate path, twig and DAG pattern queries on directed acyclic graphs. In this paper, we investigate the major results of that paper, pointing out several discrepancies and proposing solutions to resolving them. We show that the original algorithms do not find particular types of query solutions that are common in practice. We also analyze the effect of an underlying assumption on the correctness of the algorithms and discuss the pre-filtering process that the original work proposes to prune redundant nodes. Our experimental study on both real and synthetic data substantiates our conclusions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dalvi:2012:ASD, author = "Nilesh Dalvi and Ashwin Machanavajjhala and Bo Pang", title = "An analysis of structured data on the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "7", pages = "680--691", month = mar, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:09 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we analyze the nature and distribution of structured data on the Web. Web-scale information extraction, or the problem of creating structured tables using extraction from the entire web, is gathering lots of research interest. We perform a study to understand and quantify the value of Web-scale extraction, and how structured information is distributed amongst top aggregator websites and tail sites for various interesting domains. We believe this is the first study of its kind, and gives us new insights for information extraction over the Web.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mouratidis:2012:SPC, author = "Kyriakos Mouratidis and Man Lung Yiu", title = "Shortest path computation with no information leakage", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "692--703", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest path computation is one of the most common queries in location-based services (LBSs). Although particularly useful, such queries raise serious privacy concerns. 
Exposing to a (potentially untrusted) LBS the client's position and her destination may reveal personal information, such as social habits, health condition, shopping preferences, lifestyle choices, etc. The only existing method for privacy-preserving shortest path computation follows the obfuscation paradigm; it prevents the LBS from inferring the source and destination of the query with a probability higher than a threshold. This implies, however, that the LBS still deduces some information (albeit not exact) about the client's location and her destination. In this paper we aim at strong privacy, where the adversary learns nothing about the shortest path query. We achieve this via established private information retrieval techniques, which we treat as black-box building blocks. Experiments on real, large-scale road networks assess the practicality of our schemes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Metwally:2012:VSJ, author = "Ahmed Metwally and Christos Faloutsos", title = "{V-SMART-Join}: a scalable {MapReduce} framework for all-pair similarity joins of multisets and vectors", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "704--715", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This work proposes V-SMART-Join, a scalable MapReduce-based framework for discovering all pairs of similar entities. The V-SMART-Join framework is applicable to sets, multisets, and vectors.
V-SMART-Join is motivated by the observed skew in the underlying distributions of Internet traffic, and is a family of 2-stage algorithms, where the first stage computes and joins the partial results, and the second stage computes the similarity exactly for all candidate pairs. The V-SMART-Join algorithms are very efficient and scalable in the number of entities, as well as their cardinalities. They were up to 30 times faster than the state of the art algorithm, VCL, when compared on a real dataset of a small size. We also established the scalability of the proposed algorithms by running them on a dataset of a realistic size, on which VCL never succeeded to finish. Experiments were run using real datasets of IPs and cookies, where each IP is represented as a multiset of cookies, and the goal is to discover similar IPs to identify Internet proxies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Low:2012:DGF, author = "Yucheng Low and Danny Bickson and Joseph Gonzalez and Carlos Guestrin and Aapo Kyrola and Joseph M. Hellerstein", title = "{Distributed GraphLab}: a framework for machine learning and data mining in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "716--727", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While high-level data parallel frameworks, like MapReduce, simplify the design and implementation of large-scale data processing systems, they do not naturally or efficiently support many important data mining and machine learning algorithms and can lead to inefficient learning systems. 
To help fill this critical void, we introduced the GraphLab abstraction which naturally expresses asynchronous, dynamic, graph-parallel computation while ensuring data consistency and achieving a high degree of parallel performance in the shared-memory setting. In this paper, we extend the GraphLab framework to the substantially more challenging distributed setting while preserving strong data consistency guarantees. We develop graph based extensions to pipelined locking and data versioning to reduce network congestion and mitigate the effect of network latency. We also introduce fault tolerance to the GraphLab abstraction using the classic Chandy-Lamport snapshot algorithm and demonstrate how it can be easily implemented by exploiting the GraphLab abstraction itself. Finally, we evaluate our distributed implementation of the GraphLab abstraction on a large Amazon EC2 deployment and show 1--2 orders of magnitude performance gains over Hadoop-based implementations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2012:ALO, author = "Qiang Zeng and Xiaorui Jiang and Hai Zhuge", title = "Adding logical operators to tree pattern queries on graph-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "728--739", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data are increasingly modeled as graphs for expressing complex relationships, the tree pattern query on graph-structured data becomes an important type of queries in real-world applications. Most practical query languages, such as XQuery and SPARQL, support logical expressions using logical-AND/OR/NOT operators to define structural constraints of tree patterns.
In this paper, (1) we propose generalized tree pattern queries (GTPQs) over graph-structured data, which fully support propositional logic of structural constraints. (2) We make a thorough study of fundamental problems including satisfiability, containment and minimization, and analyze the computational complexity and the decision procedures of these problems. (3) We propose a compact graph representation of intermediate results and a pruning approach to reduce the size of intermediate results and the number of join operations --- two factors that often impair the efficiency of traditional algorithms for evaluating tree pattern queries. (4) We present an efficient algorithm for evaluating GTPQs using 3-hop as the underlying reachability index. (5) Experiments on both real-life and synthetic data sets demonstrate the effectiveness and efficiency of our algorithm, from several times to orders of magnitude faster than state-of-the-art algorithms in terms of evaluation time, even for traditional tree pattern queries with only conjunctive operations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Singh:2012:LSS, author = "Rishabh Singh and Sumit Gulwani", title = "Learning semantic string transformations from examples", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "740--751", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We address the problem of performing semantic transformations on strings, which may represent a variety of data types (or their combination) such as a column in a relational table, time, date, currency, etc. 
Unlike syntactic transformations, which are based on regular expressions and which interpret a string as a sequence of characters, semantic transformations additionally require exploiting the semantics of the data type represented by the string, which may be encoded as a database of relational tables. Manually performing such transformations on a large collection of strings is error prone and cumbersome, while programmatic solutions are beyond the skill-set of end-users. We present a programming by example technology that allows end-users to automate such repetitive tasks. We describe an expressive transformation language for semantic manipulation that combines table lookup operations and syntactic manipulations. We then present a synthesis algorithm that can learn all transformations in the language that are consistent with the user-provided set of input-output examples. We have implemented this technology as an add-in for the Microsoft Excel Spreadsheet system and have evaluated it successfully over several benchmarks picked from various Excel help-forums.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2012:CDD, author = "Changbin Liu and Lu Ren and Boon Thau Loo and Yun Mao and Prithwish Basu", title = "{Cologne}: a declarative distributed constraint optimization platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "752--763", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents Cologne, a declarative optimization platform that enables constraint optimization problems (COPs) to be declaratively specified and incrementally executed in distributed systems. 
Cologne integrates a declarative networking engine with an off-the-shelf constraint solver. We have developed the Colog language that combines distributed Datalog used in declarative networking with language constructs for specifying goals and constraints used in COPs. Cologne uses novel query processing strategies for processing Colog programs, by combining the use of bottom-up distributed Datalog evaluation with top-down goal-oriented constraint solving. Using case studies based on cloud and wireless network optimizations, we demonstrate that Cologne (1) can flexibly support a wide range of policy-based optimizations in distributed systems, (2) results in orders of magnitude less code compared to imperative implementations, and (3) is highly efficient with low overhead and fast convergence times.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2012:OBA, author = "Yi Zhang and Jun Yang", title = "Optimizing {I/O} for big array analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "764--775", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big array analytics is becoming indispensable in answering important scientific and business questions. Most analysis tasks consist of multiple steps, each making one or multiple passes over the arrays to be analyzed and generating intermediate results. In the big data setting, I/O optimization is a key to efficient analytics. In this paper, we develop a framework and techniques for capturing a broad range of analysis tasks expressible in nested-loop forms, representing them in a declarative way, and optimizing their I/O by identifying sharing opportunities. 
Experiment results show that our optimizer is capable of finding execution plans that exploit nontrivial I/O sharing opportunities with significant savings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bailis:2012:PBS, author = "Peter Bailis and Shivaram Venkataraman and Michael J. Franklin and Joseph M. Hellerstein and Ion Stoica", title = "Probabilistically bounded staleness for practical partial quorums", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "8", pages = "776--787", month = apr, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:10 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data store replication results in a fundamental trade-off between operation latency and data consistency. In this paper, we examine this trade-off in the context of quorum-replicated data stores. Under partial, or non-strict quorum replication, a data store waits for responses from a subset of replicas before answering a query, without guaranteeing that read and write replica sets intersect. As deployed in practice, these configurations provide only basic eventual consistency guarantees, with no limit to the recency of data returned. However, anecdotally, partial quorums are often ``good enough'' for practitioners given their latency benefits. In this work, we explain why partial quorums are regularly acceptable in practice, analyzing both the staleness of data they return and the latency benefits they offer. We introduce Probabilistically Bounded Staleness (PBS) consistency, which provides expected bounds on staleness with respect to both versions and wall clock time. 
We derive a closed-form solution for versioned staleness as well as model real-time staleness for representative Dynamo-style systems under internet-scale production workloads. Using PBS, we measure the latency-consistency trade-off for partial quorum systems. We quantitatively demonstrate how eventually consistent systems frequently return consistent data within tens of milliseconds while offering significant latency benefits.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2012:ESM, author = "Zhao Sun and Hongzhi Wang and Haixun Wang and Bin Shao and Jianzhong Li", title = "Efficient subgraph matching on billion node graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "788--799", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to handle large scale graph data is crucial to an increasing number of applications. Much work has been dedicated to supporting basic graph operations such as subgraph matching, reachability, regular expression matching, etc. In many cases, graph indices are employed to speed up query processing. Typically, most indices require either super-linear indexing time or super-linear indexing space. Unfortunately, for very large graphs, super-linear approaches are almost always infeasible. In this paper, we study the problem of subgraph matching on billion-node graphs. We present a novel algorithm that supports efficient subgraph matching for graphs deployed on a distributed memory store. Instead of relying on super-linear indices, we use efficient graph exploration and massive parallel computing for query processing. 
Our experimental results demonstrate the feasibility of performing subgraph matching on web-scale graph data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2012:ESS, author = "Ye Yuan and Guoren Wang and Lei Chen and Haixun Wang", title = "Efficient subgraph similarity search on large probabilistic graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "800--811", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many studies have been conducted on seeking the efficient solution for subgraph similarity search over certain (deterministic) graphs due to its wide application in many fields, including bioinformatics, social network analysis, and Resource Description Framework (RDF) data management. All these works assume that the underlying data are certain. However, in reality, graphs are often noisy and uncertain due to various factors, such as errors in data extraction, inconsistencies in data integration, and privacy preserving purposes. Therefore, in this paper, we study subgraph similarity search on large probabilistic graph databases. Different from previous works assuming that edges in an uncertain graph are independent of each other, we study the uncertain graphs where edges' occurrences are correlated. We formally prove that subgraph similarity search over probabilistic graphs is \#P-complete, thus, we employ a filter-and-verify framework to speed up the search. In the filtering phase, we develop tight lower and upper bounds of subgraph similarity probability based on a probabilistic matrix index, PMI. 
PMI is composed of discriminative subgraph features associated with tight lower and upper bounds of subgraph isomorphism probability. Based on PMI, we can sort out a large number of probabilistic graphs and maximize the pruning capability. During the verification phase, we develop an efficient sampling algorithm to validate the remaining candidates. The efficiency of our proposed solutions has been verified through extensive experiments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2012:TDM, author = "Jia Wang and James Cheng", title = "Truss decomposition in massive networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "812--823", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The $k$-truss is a type of cohesive subgraphs proposed recently for the study of networks. While the problem of computing most cohesive subgraphs is NP-hard, there exists a polynomial time algorithm for computing $k$-truss. Compared with $k$-core which is also efficient to compute, $k$-truss represents the ``core'' of a $k$-core that keeps the key information of, while filtering out less important information from, the $k$-core. However, existing algorithms for computing $k$-truss are inefficient for handling today's massive networks. We first improve the existing in-memory algorithm for computing $k$-truss in networks of moderate size. Then, we propose two I/O-efficient algorithms to handle massive networks that cannot fit in main memory. Our experiments on real datasets verify the efficiency of our algorithms and the value of $k$-truss.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2012:SST, author = "Ju Fan and Guoliang Li and Lizhu Zhou and Shanshan Chen and Jun Hu", title = "{Seal}: spatio-textual similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "824--835", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Location-based services (LBS) have become more and more ubiquitous recently. Existing methods focus on finding relevant points-of-interest (POIs) based on users' locations and query keywords. Nowadays, modern LBS applications generate a new kind of spatio-textual data, regions-of-interest (ROIs), containing region-based spatial information and textual description, e.g., mobile user profiles with active regions and interest tags. To satisfy search requirements on ROIs, we study a new research problem, called spatio-textual similarity search: Given a set of ROIs and a query ROI, we find the similar ROIs by considering spatial overlap and textual similarity. Spatio-textual similarity search has many important applications, e.g., social marketing in location-aware social networks. It calls for an efficient search method to support large scales of spatio-textual data in LBS systems. To this end, we introduce a filter-and-verification framework to compute the answers. In the filter step, we generate signatures for the ROIs and the query, and utilize the signatures to generate candidates whose signatures are similar to that of the query. In the verification step, we verify the candidates and identify the final answers. To achieve high performance, we generate effective high-quality signatures, and devise efficient filtering algorithms as well as pruning techniques. 
Experimental results on real and synthetic datasets show that our method achieves high performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lappas:2012:SBT, author = "Theodoros Lappas and Marcos R. Vieira and Dimitrios Gunopulos and Vassilis J. Tsotras", title = "On the spatiotemporal burstiness of terms", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "836--847", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Thousands of documents are made available to the users via the web on a daily basis. One of the most extensively studied problems in the context of such document streams is burst identification. Given a term $t$, a burst is generally exhibited when an unusually high frequency is observed for $t$. While spatial and temporal burstiness have been studied individually in the past, our work is the first to simultaneously track and measure spatiotemporal term burstiness. In addition, we use the mined burstiness information toward an efficient document-search engine: given a user's query of terms, our engine returns a ranked list of documents discussing influential events with a strong spatiotemporal impact. We demonstrate the efficiency of our methods with an extensive experimental evaluation on real and synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shirani-Mehr:2012:ERQ, author = "Houtan Shirani-Mehr and Farnoush Banaei-Kashani and Cyrus Shahabi", title = "Efficient reachability query evaluation in large spatiotemporal contact datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "848--859", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the advent of reliable positioning technologies and prevalence of location-based services, it is now feasible to accurately study the propagation of items such as infectious viruses, sensitive information pieces, and malwares through a population of moving objects, e.g., individuals, mobile devices, and vehicles. In such application scenarios, an item passes between two objects when the objects are sufficiently close (i.e., when they are, so-called, in contact), and hence once an item is initiated, it can penetrate the object population through the evolving network of contacts among objects, termed contact network. In this paper, for the first time we define and study reachability queries in large (i.e., disk-resident) contact datasets which record the movement of a (potentially large) set of objects moving in a spatial environment over an extended time period. A reachability query verifies whether two objects are ``reachable'' through the evolving contact network represented by such contact datasets. We propose two contact-dataset indexes that enable efficient evaluation of such queries despite the potentially humongous size of the contact datasets. With the first index, termed ReachGrid, at the query time only a small necessary portion of the contact network which is required for reachability evaluation is constructed and traversed. 
With the second approach, termed ReachGraph, we precompute reachability at different scales and leverage these precalculations at the query time for efficient query processing. We optimize the placement of both indexes on disk to enable efficient index traversal during query processing. We study the pros and cons of our proposed approaches by performing extensive experiments with both real and synthetic data. Based on our experimental results, our proposed approaches outperform existing reachability query processing techniques in contact networks by 76\% on average.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2012:BMO, author = "Thi Nguyen and Zhen He and Rui Zhang and Phillip Ward", title = "Boosting moving object indexing through velocity partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "860--871", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There have been intense research interests in moving object indexing in the past decade. However, existing work did not exploit the important property of skewed velocity distributions. In many real world scenarios, objects travel predominantly along only a few directions. Examples include vehicles on road networks, flights, people walking on the streets, etc. The search space for a query is heavily dependent on the velocity distribution of the objects grouped in the nodes of an index tree. Motivated by this observation, we propose the velocity partitioning (VP) technique, which exploits the skew in velocity distribution to speed up query processing using moving object indexes. 
The VP technique first identifies the ``dominant velocity axes (DVAs)'' using a combination of principal components analysis (PCA) and $k$-means clustering. Then, a moving object index (e.g., a TPR-tree) is created based on each DVA, using the DVA as an axis of the underlying coordinate system. An object is maintained in the index whose DVA is closest to the object's current moving direction. Thus, all the objects in an index are moving in a near 1-dimensional space instead of a 2-dimensional space. As a result, the expansion of the search space with time is greatly reduced, from a quadratic function of the maximum speed (of the objects in the search range) to a near linear function of the maximum speed. The VP technique can be applied to a wide range of moving object index structures. We have implemented the VP technique on two representative ones, the TPR*-tree and the B$^x$-tree. Extensive experiments validate that the VP technique consistently improves the performance of those index structures.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bidoit-Tollu:2012:TBD, author = "Nicole Bidoit-Tollu and Dario Colazzo and Federico Ulliana", title = "Type-based detection of {XML} query-update independence", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "872--883", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a novel static analysis technique to detect XML query-update independence, in the presence of a schema. Rather than types, our system infers chains of types. Each chain represents a path that can be traversed on a valid document during query/update evaluation. 
The resulting independence analysis is precise, although it raises a challenging issue: recursive schemas may lead to inference of infinitely many chains. A sound and complete approximation technique ensuring a finite analysis in any case is presented, together with an efficient implementation performing the chain-based analysis in polynomial space and time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sowell:2012:MSD, author = "Benjamin Sowell and Wojciech Golab and Mehul A. Shah", title = "{Minuet}: a scalable distributed multiversion {B}-tree", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "884--895", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data management systems have traditionally been designed to support either long-running analytics queries or short-lived transactions, but an increasing number of applications need both. For example, online games, socio-mobile apps, and e-commerce sites need to not only maintain operational state, but also analyze that data quickly to make predictions and recommendations that improve user experience. In this paper, we present Minuet, a distributed, main-memory B-tree that supports both transactions and copy-on-write snapshots for in-situ analytics. Minuet uses main-memory storage to enable low-latency transactional operations as well as analytics queries without compromising transaction performance. In addition to supporting read-only analytics queries on snapshots, Minuet supports writable clones, so that users can create branching versions of the data. This feature can be quite useful, e.g., to support complex ``what-if'' analysis or to facilitate wide-area replication.
Our experiments show that Minuet outperforms a commercial main-memory database in many ways. It scales to hundreds of cores and TBs of memory, and can process hundreds of thousands of B-tree operations per second while executing long-running scans.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yin:2012:CLT, author = "Hongzhi Yin and Bin Cui and Jing Li and Junjie Yao and Chen Chen", title = "Challenging the long tail recommendation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "9", pages = "896--907", month = may, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:11 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The success of ``infinite-inventory'' retailers such as Amazon.com and Netflix has been largely attributed to a ``long tail'' phenomenon. Although the majority of their inventory is not in high demand, these niche products, unavailable at limited-inventory competitors, generate a significant fraction of total revenue in aggregate. In addition, tail product availability can boost head sales by offering consumers the convenience of ``one-stop shopping'' for both their mainstream and niche tastes. However, most of existing recommender systems, especially collaborative filter based methods, can not recommend tail products due to the data sparsity issue. It has been widely acknowledged that to recommend popular products is easier yet more trivial while to recommend long tail products adds more novelty yet it is also a more challenging task. In this paper, we propose a novel suite of graph-based algorithms for the long tail recommendation. 
We first represent user-item information with undirected edge-weighted graph and investigate the theoretical foundation of applying Hitting Time algorithm for long tail item recommendation. To improve recommendation diversity and accuracy, we extend Hitting Time and propose efficient Absorbing Time algorithm to help users find their favorite long tail items. Finally, we refine the Absorbing Time algorithm and propose two entropy-biased Absorbing Cost algorithms to distinguish the variation on different user-item rating pairs, which further enhances the effectiveness of long tail recommendation. Empirical experiments on two real life datasets show that our proposed algorithms are effective to recommend long tail items and outperform state-of-the-art recommendation techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pimplikar:2012:ATQ, author = "Rakesh Pimplikar and Sunita Sarawagi", title = "Answering table queries on the {Web} using column keywords", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "908--919", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present the design of a structured search engine which returns a multi-column table in response to a query consisting of keywords describing each of its columns. We answer such queries by exploiting the millions of tables on the Web because these are much richer sources of structured knowledge than free-format text. However, a corpus of tables harvested from arbitrary HTML web pages presents huge challenges of diversity and redundancy not seen in centrally edited knowledge bases. We concentrate on one concrete task in this paper. 
Given a set of Web tables T$_1$,\ldots{}, T$_n$, and a query Q with q sets of keywords Q$_1$,\ldots{}, Q$_q$, decide for each T$_i$ if it is relevant to Q and if so, identify the mapping between the columns of T$_i$ and query columns. We represent this task as a graphical model that jointly maps all tables by incorporating diverse sources of clues spanning matches in different parts of the table, corpus-wide co-occurrence statistics, and content overlap across table columns. We define a novel query segmentation model for matching keywords to table columns, and a robust mechanism of exploiting content overlap across table columns. We design efficient inference algorithms based on bipartite matching and constrained graph cuts to solve the joint labeling task. Experiments on a workload of 59 queries over a 25 million web table corpus shows significant boost in accuracy over baseline IR methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goodrich:2012:EVW, author = "Michael T. Goodrich and Charalampos Papamanthou and Duy Nguyen and Roberto Tamassia and Cristina Videira Lopes and Olga Ohrimenko and Nikos Triandopoulos", title = "Efficient verification of web-content searching through authenticated web crawlers", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "920--931", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the problem of verifying the correctness and completeness of the result of a keyword search. We introduce the concept of an authenticated web crawler and present its design and prototype implementation. 
An authenticated web crawler is a trusted program that computes a specially-crafted signature over the web contents it visits. This signature enables (i) the verification of common Internet queries on web pages, such as conjunctive keyword searches---this guarantees that the output of a conjunctive keyword search is correct and complete; (ii) the verification of the content returned by such Internet queries---this guarantees that web data is authentic and has not been maliciously altered since the computation of the signature by the crawler. In our solution, the search engine returns a cryptographic proof of the query result. Both the proof size and the verification time are proportional only to the sizes of the query description and the query result, but do not depend on the number or sizes of the web pages over which the search is performed. As we experimentally demonstrate, the prototype implementation of our system provides a low communication overhead between the search engine and the user, and fast verification of the returned results by the user.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Blunschi:2012:SGS, author = "Lukas Blunschi and Claudio Jossen and Donald Kossmann and Magdalini Mori and Kurt Stockinger", title = "{SODA}: generating {SQL} for business users", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "932--943", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The purpose of data warehouses is to enable business analysts to make better decisions. Over the years the technology has matured and data warehouses have become extremely successful. 
As a consequence, more and more data has been added to the data warehouses and their schemas have become increasingly complex. These systems still work great in order to generate pre-canned reports. However, with their current complexity, they tend to be a poor match for non tech-savvy business analysts who need answers to ad-hoc queries that were not anticipated. This paper describes the design, implementation, and experience of the SODA system (Search over DAta Warehouse). SODA bridges the gap between the business needs of analysts and the technical complexity of current data warehouses. SODA enables a Google-like search experience for data warehouses by taking keyword queries of business users and automatically generating executable SQL. The key idea is to use a graph pattern matching algorithm that uses the metadata model of the data warehouse. Our results with real data from a global player in the financial services industry show that SODA produces queries with high precision and recall, and makes it much easier for business users to interactively explore highly-complex data warehouses.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Terrovitis:2012:PPD, author = "Manolis Terrovitis and Nikos Mamoulis and John Liagouris and Spiros Skiadopoulos", title = "Privacy preservation by disassociation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "944--955", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we focus on protection against identity disclosure in the publication of sparse multidimensional data. 
Existing multidimensional anonymization techniques (a) protect the privacy of users either by altering the set of quasi-identifiers of the original data (e.g., by generalization or suppression) or by adding noise (e.g., using differential privacy) and/or (b) assume a clear distinction between sensitive and non-sensitive information and sever the possible linkage. In many real world applications the above techniques are not applicable. For instance, consider web search query logs. Suppressing or generalizing anonymization methods would remove the most valuable information in the dataset: the original query terms. Additionally, web search query logs contain millions of query terms which cannot be categorized as sensitive or non-sensitive since a term may be sensitive for a user and non-sensitive for another. Motivated by this observation, we propose an anonymization technique termed disassociation that preserves the original terms but hides the fact that two or more different terms appear in the same record. We protect the users' privacy by disassociating record terms that participate in identifying combinations. This way the adversary cannot associate with high probability a record with a rare combination of terms. To the best of our knowledge, our proposal is the first to employ such a technique to provide protection against identity disclosure. We propose an anonymization algorithm based on our approach and evaluate its performance on real and synthetic datasets, comparing it against other state-of-the-art methods based on generalization and differential privacy.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kanagal:2012:SRS, author = "Bhargav Kanagal and Amr Ahmed and Sandeep Pandey and Vanja Josifovski and Jeff Yuan and Lluis Garcia-Pueyo", title = "Supercharging recommender systems using taxonomies for learning user purchase behavior", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "956--967", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recommender systems based on latent factor models have been effectively used for understanding user interests and predicting future actions. Such models work by projecting the users and items into a smaller dimensional space, thereby clustering similar users and items together and subsequently compute similarity between unknown user-item pairs. When user-item interactions are sparse (sparsity problem) or when new items continuously appear (cold start problem), these models perform poorly. In this paper, we exploit the combination of taxonomies and latent factor models to mitigate these issues and improve recommendation accuracy. We observe that taxonomies provide structure similar to that of a latent factor model: namely, it imposes human-labeled categories (clusters) over items. This leads to our proposed taxonomy-aware latent factor model (TF) which combines taxonomies and latent factors using additive models. We develop efficient algorithms to train the TF models, which scales to large number of users/items and develop scalable inference/recommendation algorithms by exploiting the structure of the taxonomy. In addition, we extend the TF model to account for the temporal dynamics of user interests using high-order Markov chains. 
To deal with large-scale data, we develop a parallel multi-core implementation of our TF model. We empirically evaluate the TF model for the task of predicting user purchases using a real-world shopping dataset spanning more than a million users and products. Our experiments demonstrate the benefits of using our TF models over existing approaches, in terms of both prediction accuracy and running time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ahmad:2012:DHO, author = "Yanif Ahmad and Oliver Kennedy and Christoph Koch and Milos Nikolic", title = "{DBToaster}: higher-order delta processing for dynamic, frequently fresh views", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "968--979", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Applications ranging from algorithmic trading to scientific data analysis require realtime analytics based on views over databases that change at very high rates. Such views have to be kept fresh at low maintenance cost and latencies. At the same time, these views have to support classical SQL, rather than window semantics, to enable applications that combine current with aged or historical data. In this paper, we present viewlet transforms, a recursive finite differencing technique applied to queries. The viewlet transform materializes a query and a set of its higher-order deltas as views. These views support each other's incremental maintenance, leading to a reduced overall view maintenance cost. The viewlet transform of a query admits efficient evaluation, the elimination of certain expensive query operations, and aggressive parallelization. 
We develop viewlet transforms into a workable query execution technique, present a heuristic and cost-based optimization framework, and report on experiments with a prototype dynamic data management system that combines viewlet transforms with an optimizing compilation technique. The system supports tens of thousands of complete view refreshes a second for a wide range of queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Agarwal:2012:RTD, author = "Manoj K. Agarwal and Krithi Ramamritham and Manish Bhide", title = "Real time discovery of dense clusters in highly dynamic graphs: identifying real world events in highly dynamic environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "980--991", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to their real time nature, microblog streams are a rich source of dynamic information, for example, about emerging events. Existing techniques for discovering such events from a microblog stream in real time (such as Twitter trending topics), have several lacunae when used for discovering emerging events; extant graph based event detection techniques are not practical in microblog settings due to their complexity; and conventional techniques, which have been developed for blogs, web-pages, etc., involving the use of keyword search, are only useful for finding information about known events. Hence, in this paper, we present techniques to discover events that are unraveling in microblog message streams in real time so that such events can be reported as soon as they occur. We model the problem as discovering dense clusters in highly dynamic graphs. 
Despite many recent advances in graph analysis, ours is the first technique to identify dense clusters in massive and highly dynamic graphs in real time. Given the characteristics of microblog streams, in order to find clusters without missing any events, we propose and exploit a novel graph property which we call short-cycle property. Our algorithms find these clusters efficiently in spite of rapid changes to the microblog streams. Further we present a novel ranking function to identify the important events. Besides proving the correctness of our algorithms we show their practical utility by evaluating them using real world microblog data. These demonstrate our technique's ability to discover, with high precision and recall, emerging events in high intensity data streams in real time. Many recent web applications create data which can be represented as massive dynamic graphs. Our technique can be easily extended to discover, in real time, interesting patterns in such graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papapetrou:2012:SBQ, author = "Odysseas Papapetrou and Minos Garofalakis and Antonios Deligiannakis", title = "Sketch-based querying of distributed sliding-window data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "992--1003", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While traditional data-management systems focus on evaluating single, ad-hoc queries over static data sets in a centralized setting, several emerging applications require (possibly, continuous) answers to queries on dynamic data that is widely distributed and constantly updated. 
Furthermore, such query answers often need to discount data that is ``stale'', and operate solely on a sliding window of recent data arrivals (e.g., data updates occurring over the last 24 hours). Such distributed data streaming applications mandate novel algorithmic solutions that are both time- and space-efficient (to manage high-speed data streams), and also communication-efficient (to deal with physical data distribution). In this paper, we consider the problem of complex query answering over distributed, high-dimensional data streams in the sliding-window model. We introduce a novel sketching technique (termed ECM-sketch) that allows effective summarization of streaming data over both time-based and count-based sliding windows with probabilistic accuracy guarantees. Our sketch structure enables point as well as inner-product queries, and can be employed to address a broad range of problems, such as maintaining frequency statistics, finding heavy hitters, and computing quantiles in the sliding-window model. Focusing on distributed environments, we demonstrate how ECM-sketches of individual, local streams can be composed to generate a (low-error) ECM-sketch summary of the order-preserving aggregation of all streams; furthermore, we show how ECM-sketches can be exploited for continuous monitoring of sliding-window queries over distributed streams. Our extensive experimental study with two real-life data sets validates our theoretical claims and verifies the effectiveness of our techniques. To the best of our knowledge, ours is the first work to address efficient, guaranteed-error complex query answering over distributed data streams in the sliding-window model.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vo:2012:LSL, author = "Hoang Tam Vo and Sheng Wang and Divyakant Agrawal and Gang Chen and Beng Chin Ooi", title = "{LogBase}: a scalable log-structured database system in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1004--1015", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Numerous applications such as financial transactions (e.g., stock trading) are write-heavy in nature. The shift from reads to writes in web applications has also been accelerating in recent years. Write-ahead-logging is a common approach for providing recovery capability while improving performance in most storage systems. However, the separation of log and application data incurs write overheads observed in write-heavy environments and hence adversely affects the write throughput and recovery time in the system. In this paper, we introduce LogBase --- a scalable log-structured database system that adopts log-only storage for removing the write bottleneck and supporting fast system recovery. It is designed to be dynamically deployed on commodity clusters to take advantage of elastic scaling property of cloud environments. LogBase provides in-memory multiversion indexes for supporting efficient access to data maintained in the log. LogBase also supports transactions that bundle read and write operations spanning across multiple records. We implemented the proposed system and compared it with HBase and a disk-based log-structured record-oriented system modeled after RAMCloud. 
The experimental results show that LogBase is able to provide sustained write throughput, efficient data access out of the cache, and effective system recovery.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2012:EPN, author = "Wei Lu and Yanyan Shen and Su Chen and Beng Chin Ooi", title = "Efficient processing of $k$ nearest neighbor joins using {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1016--1027", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "$k$ nearest neighbor join ($k$ NN join), designed to find $k$ nearest neighbors from a dataset S for every object in another dataset R, is a primitive operation widely adopted by many data mining applications. As a combination of the $k$ nearest neighbor query and the join operation, $k$ NN join is an expensive operation. Given the increasing volume of data, it is difficult to perform a $k$ NN join on a centralized machine efficiently. In this paper, we investigate how to perform $k$ NN join using MapReduce which is a well-accepted framework for data-intensive applications over clusters of computers. In brief, the mappers cluster objects into groups; the reducers perform the $k$ NN join on each group of objects separately. We design an effective mapping mechanism that exploits pruning rules for distance filtering, and hence reduces both the shuffling and computational costs. To reduce the shuffling cost, we propose two approximate algorithms to minimize the number of replicas. Extensive experiments on our in-house cluster demonstrate that our proposed methods are efficient, robust and scalable.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Laptev:2012:EAR, author = "Nikolay Laptev and Kai Zeng and Carlo Zaniolo", title = "Early accurate results for advanced analytics on {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1028--1039", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate results based on samples often provide the only way in which advanced analytical applications on very massive data sets can satisfy their time and resource constraints. Unfortunately, methods and tools for the computation of accurate early results are currently not supported in MapReduce-oriented systems although these are intended for `big data'. Therefore, we proposed and implemented a non-parametric extension of Hadoop which allows the incremental computation of early results for arbitrary work-flows, along with reliable on-line estimates of the degree of accuracy achieved so far in the computation. These estimates are based on a technique called bootstrapping that has been widely employed in statistics and can be applied to arbitrary functions and data distributions. In this paper, we describe our Early Accurate Result Library (EARL) for Hadoop that was designed to minimize the changes required to the MapReduce framework. Various tests of EARL of Hadoop are presented to characterize the frequent situations where EARL can provide major speed-ups over the current version of Hadoop.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2012:CCD, author = "Xuan Liu and Meiyu Lu and Beng Chin Ooi and Yanyan Shen and Sai Wu and Meihui Zhang", title = "{CDAS}: a crowdsourcing data analytics system", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1040--1051", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Some complex problems, such as image tagging and natural language processing, are very challenging for computers, where even state-of-the-art technology is yet able to provide satisfactory accuracy. Therefore, rather than relying solely on developing new and better algorithms to handle such tasks, we look to the crowdsourcing solution --- employing human participation --- to make good the shortfall in current technology. Crowdsourcing is a good supplement to many computer tasks. A complex job may be divided into computer-oriented tasks and human-oriented tasks, which are then assigned to machines and humans respectively. To leverage the power of crowdsourcing, we design and implement a Crowdsourcing Data Analytics System, CDAS. CDAS is a framework designed to support the deployment of various crowdsourcing applications. The core part of CDAS is a quality-sensitive answering model, which guides the crowdsourcing engine to process and monitor the human tasks. In this paper, we introduce the principles of our quality-sensitive model. To satisfy user required accuracy, the model guides the crowdsourcing query engine for the design and processing of the corresponding crowdsourcing jobs. It provides an estimated accuracy for each generated result based on the human workers' historical performances. 
When verifying the quality of the result, the model employs an online strategy to reduce waiting time. To show the effectiveness of the model, we implement and deploy two analytics jobs on CDAS, a twitter sentiment analytics job and an image tagging job. We use real Twitter and Flickr data as our queries respectively. We compare our approaches with state-of-the-art classification and image annotation techniques. The results show that the human-assisted methods can indeed achieve a much higher accuracy. By embedding the quality-sensitive model into crowdsourcing query engine, we effectively reduce the processing cost while maintaining the required query answer quality.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sachan:2012:MSS, author = "Mayank Sachan and Arnab Bhattacharya", title = "Mining statistically significant substrings using the chi-square statistic", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1052--1063", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of identification of statistically significant patterns in a sequence of data has been applied to many domains such as intrusion detection systems, financial models, web-click records, automated monitoring systems, computational biology, cryptology, and text analysis. An observed pattern of events is deemed to be statistically significant if it is unlikely to have occurred due to randomness or chance alone. We use the chi-square statistic as a quantitative measure of statistical significance. 
Given a string of characters generated from a memoryless Bernoulli model, the problem is to identify the substring for which the empirical distribution of single letters deviates the most from the distribution expected from the generative Bernoulli model. This deviation is captured using the chi-square measure. The most significant substring (MSS) of a string is thus defined as the substring having the highest chi-square value. Till date, to the best of our knowledge, there does not exist any algorithm to find the MSS in better than $ O(n^2) $ time, where $n$ denotes the length of the string. In this paper, we propose an algorithm to find the most significant substring, whose running time is $ O(n^{3 / 2})$ with high probability. We also study some variants of this problem such as finding the top-$t$ set, finding all substrings having chi-square greater than a fixed threshold and finding the MSS among substrings greater than a given length. We experimentally demonstrate the asymptotic behavior of the MSS on varying the string size and alphabet size. We also describe some applications of our algorithm on cryptology and real world data from finance and sports. Finally, we compare our technique with the existing heuristics for finding the MSS.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Albutiu:2012:MPS, author = "Martina-Cezara Albutiu and Alfons Kemper and Thomas Neumann", title = "Massively parallel sort-merge joins in main memory multi-core database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1064--1075", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Two emerging hardware trends will dominate the database system technology in the near future: increasing main memory capacities of several TB per server and massively parallel multi-core processing. Many algorithmic and control techniques in current database technology were devised for disk-based systems where I/O dominated the performance. In this work we take a new look at the well-known sort-merge join which, so far, has not been in the focus of research in scalable massively parallel multi-core data processing as it was deemed inferior to hash joins. We devise a suite of new massively parallel sort-merge (MPSM) join algorithms that are based on partial partition-based sorting. Contrary to classical sort-merge joins, our MPSM algorithms do not rely on a hard to parallelize final merge step to create one complete sort order. Rather they work on the independently created runs in parallel. This way our MPSM algorithms are NUMA-affine as all the sorting is carried out on local memory partitions. An extensive experimental evaluation on a modern 32-core machine with one TB of main memory proves the competitive performance of MPSM on large main memory databases with billions of objects. 
It scales (almost) linearly in the number of employed cores and clearly outperforms competing hash join proposals --- in particular it outperforms the ``cutting-edge'' Vectorwise parallel query engine by a factor of four.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luo:2012:HDH, author = "Tian Luo and Rubao Lee and Michael Mesnier and Feng Chen and Xiaodong Zhang", title = "{hStorage-DB}: heterogeneity-aware data management to exploit the full capability of hybrid storage systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "10", pages = "1076--1087", month = jun, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:13 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As storage systems become increasingly heterogeneous and complex, it adds burdens on DBAs, causing suboptimal performance even after a lot of human efforts have been made. In addition, existing monitoring-based storage management by access pattern detections has difficulties to handle workloads that are highly dynamic and concurrent. To achieve high performance by best utilizing heterogeneous storage devices, we have designed and implemented a heterogeneity-aware software framework for DBMS storage management called hStorage-DB, where semantic information that is critical for storage I/O is identified and passed to the storage manager. According to the collected semantic information, requests are classified into different types. Each type is assigned a proper QoS policy supported by the underlying storage system, so that every request will be served with a suitable storage device. 
With hStorage-DB, we can well utilize semantic information that cannot be detected through data access monitoring but is particularly important for a hybrid storage system. To show the effectiveness of hStorage-DB, we have implemented a system prototype that consists of an I/O request classification enabled DBMS, and a hybrid storage system that is organized into a two-level caching hierarchy. Our performance evaluation shows that hStorage-DB can automatically make proper decisions for data allocation in different storage devices and make substantial performance improvements in a cost-efficient way.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Choi:2012:SAM, author = "Dong-Wan Choi and Chin-Wan Chung and Yufei Tao", title = "A scalable algorithm for maximizing range sum in spatial databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1088--1099", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper investigates the MaxRS problem in spatial databases. Given a set O of weighted points and a rectangular region r of a given size, the goal of the MaxRS problem is to find a location of r such that the sum of the weights of all the points covered by r is maximized. This problem is useful in many location-based applications such as finding the best place for a new franchise store with a limited delivery range and finding the most attractive place for a tourist with a limited reachable range. However, the problem has been studied mainly in theory, particularly, in computational geometry. The existing algorithms from the computational geometry community are in-memory algorithms which do not guarantee the scalability. 
In this paper, we propose a scalable external-memory algorithm (ExactMaxRS) for the MaxRS problem, which is optimal in terms of the I/O complexity. Furthermore, we propose an approximation algorithm (ApproxMaxCRS) for the MaxCRS problem that is a circle version of the MaxRS problem. We prove the correctness and optimality of the ExactMaxRS algorithm along with the approximation bound of the ApproxMaxCRS algorithm. From extensive experimental results, we show that the ExactMaxRS algorithm is two orders of magnitude faster than methods adapted from existing algorithms, and the approximation bound in practice is much better than the theoretical bound of the ApproxMaxCRS algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aly:2012:SQT, author = "Ahmed M. Aly and Walid G. Aref and Mourad Ouzzani", title = "Spatial queries with two {kNN} predicates", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1100--1111", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The widespread use of location-aware devices has led to countless location-based services in which a user query can be arbitrarily complex, i.e., one that embeds multiple spatial selection and join predicates. Amongst these predicates, the $k$-Nearest-Neighbor ($k$NN) predicate stands as one of the most important and widely used predicates. Unlike related research, this paper goes beyond the optimization of queries with single $k$NN predicates, and shows how queries with two $k$NN predicates can be optimized.
In particular, the paper addresses the optimization of queries with: (i) two $k$NN-select predicates, (ii) two $k$NN-join predicates, and (iii) one $k$NN-join predicate and one $k$NN-select predicate. For each type of queries, conceptually correct query evaluation plans (QEPs) and new algorithms that optimize the query execution time are presented. Experimental results demonstrate that the proposed algorithms outperform the conceptually correct QEPs by orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sheng:2012:OAC, author = "Cheng Sheng and Nan Zhang and Yufei Tao and Xin Jin", title = "Optimal algorithms for crawling a hidden database in the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1112--1123", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A hidden database refers to a dataset that an organization makes accessible on the web by allowing users to issue queries through a search interface. In other words, data acquisition from such a source is not by following static hyper-links. Instead, data are obtained by querying the interface, and reading the result page dynamically generated. This, with other facts such as the interface may answer a query only partially, has prevented hidden databases from being crawled effectively by existing search engines. This paper remedies the problem by giving algorithms to extract all the tuples from a hidden database. Our algorithms are provably efficient, namely, they accomplish the task by performing only a small number of queries, even in the worst case.
We also establish theoretical results indicating that these algorithms are asymptotically optimal --- i.e., it is impossible to improve their efficiency by more than a constant factor. The derivation of our upper and lower bound results reveals significant insight into the characteristics of the underlying problem. Extensive experiments confirm the proposed techniques work very well on all the real datasets examined.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qin:2012:DTR, author = "Lu Qin and Jeffrey Xu Yu and Lijun Chang", title = "Diversifying top-$k$ results", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1124--1135", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Top-$k$ query processing finds a list of $k$ results that have largest scores w.r.t the user given query, with the assumption that all the $k$ results are independent to each other. In practice, some of the top-$k$ results returned can be very similar to each other. As a result some of the top-$k$ results returned are redundant. In the literature, diversified top-$k$ search has been studied to return $k$ results that take both score and diversity into consideration. Most existing solutions on diversified top-$k$ search assume that scores of all the search results are given, and some works solve the diversity problem on a specific problem and can hardly be extended to general cases. In this paper, we study the diversified top-$k$ search problem. We define a general diversified top-$k$ search problem that only considers the similarity of the search results themselves. 
We propose a framework, such that most existing solutions for top-$k$ query processing can be extended easily to handle diversified top-$k$ search, by simply applying three new functions, a sufficient stop condition sufficient(), a necessary stop condition necessary(), and an algorithm for diversified top-$k$ search on the current set of generated results, div-search-current(). We propose three new algorithms, namely, div-astar, div-dp, and div-cut to solve the div-search-current() problem. div-astar is an A* based algorithm, div-dp is an algorithm that decomposes the results into components which are searched using div-astar independently and combined using dynamic programming. div-cut further decomposes the current set of generated results using cut points and combines the results using sophisticated operations. We conducted extensive performance studies using two real datasets, enwiki and reuters. Our div-cut algorithm finds the optimal solution for diversified top-$k$ search problem in seconds even for $k$ as large as 2,000.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2012:KAO, author = "Xin Cao and Lisi Chen and Gao Cong and Xiaokui Xiao", title = "Keyword-aware optimal route search", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1136--1147", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Identifying a preferable route is an important problem that finds applications in map services.
When a user plans a trip within a city, the user may want to find ``a most popular route such that it passes by shopping mall, restaurant, and pub, and the travel time to and from his hotel is within 4 hours.'' However, none of the algorithms in the existing work on route planning can be used to answer such queries. Motivated by this, we define the problem of keyword-aware optimal route query, denoted by KOR, which is to find an optimal route such that it covers a set of user-specified keywords, a specified budget constraint is satisfied, and an objective score of the route is optimal. The problem of answering KOR queries is NP-hard. We devise an approximation algorithm OSScaling with provable approximation bounds. Based on this algorithm, another more efficient approximation algorithm BucketBound is proposed. We also design a greedy approximation algorithm. Results of empirical studies show that all the proposed algorithms are capable of answering KOR queries efficiently, while the BucketBound and Greedy algorithms run faster. The empirical studies also offer insight into the accuracy of the proposed algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cautis:2012:AQU, author = "Bogdan Cautis and Evgeny Kharlamov", title = "Answering queries using views over probabilistic {XML}: complexity and tractability", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1148--1159", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the complexity of query answering using views in a probabilistic XML setting, identifying large classes of XPath queries --- with child and descendant navigation and predicates --- for which there are efficient (PTime) algorithms. We consider this problem under the two possible semantics for XML query results: with persistent node identifiers and in their absence. Accordingly, we consider rewritings that can exploit a single view, by means of compensation, and rewritings that can use multiple views, by means of intersection. Since in a probabilistic setting queries return answers with probabilities, the problem of rewriting goes beyond the classic one of retrieving XML answers from views. For both semantics of XML queries, we show that, even when XML answers can be retrieved from views, their probabilities may not be computable. For rewritings that use only compensation, we describe a PTime decision procedure, based on easily verifiable criteria that distinguish between the feasible cases --- when probabilistic XML results are computable --- and the unfeasible ones. 
For rewritings that can use multiple views, with compensation and intersection, we identify the most permissive conditions that make probabilistic rewriting feasible, and we describe an algorithm that is sound in general, and becomes complete under fairly permissive restrictions, running in PTime modulo worst-case exponential time equivalence tests. This is the best we can hope for since intersection makes query equivalence intractable already over deterministic data. Our algorithm runs in PTime whenever deterministic rewritings can be found in PTime.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jha:2012:PDM, author = "Abhay Jha and Dan Suciu", title = "Probabilistic databases with {MarkoViews}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1160--1171", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most of the work on query evaluation in probabilistic databases has focused on the simple tuple-independent data model, where tuples are independent random events. Several efficient query evaluation techniques exist in this setting, such as safe plans, algorithms based on OBDDs, tree-decomposition and a variety of approximation algorithms. However, complex data analytics tasks often require complex correlations, and query evaluation then is significantly more expensive, or more restrictive. In this paper, we propose MVDB as a framework both for representing complex correlations and for efficient query evaluation. An MVDB specifies correlations by views, called MarkoViews, on the probabilistic relations and declaring the weights of the view's outputs. An MVDB is a (very large) Markov Logic Network.
We make two sets of contributions. First, we show that query evaluation on an MVDB is equivalent to evaluating a Union of Conjunctive Queries (UCQ) over a tuple-independent database. The translation is exact (thus allowing the techniques developed for tuple independent databases to be carried over to MVDB), yet it is novel and quite non-obvious (some resulting probabilities may be negative!). This translation in itself though may not lead to much gain since the translated query gets complicated as we try to capture more correlations. Our second contribution is to propose a new query evaluation strategy that exploits offline compilation to speed up online query evaluation. Here we utilize and extend our prior work on compilation of UCQ. We validate experimentally our techniques on a large probabilistic database with MarkoViews inferred from the DBLP data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mamouras:2012:CSC, author = "Konstantinos Mamouras and Sigal Oren and Lior Seeman and Lucja Kot and Johannes Gehrke", title = "The complexity of social coordination", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1172--1183", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Coordination is a challenging everyday task; just think of the last time you organized a party or a meeting involving several people. As a growing part of our social and professional life goes online, an opportunity for an improved coordination process arises. Recently, Gupta et al. proposed entangled queries as a declarative abstraction for data-driven coordination, where the difficulty of the coordination task is shifted from the user to the database.
Unfortunately, evaluating entangled queries is very hard, and thus previous work considered only a restricted class of queries that satisfy safety (the coordination partners are fixed) and uniqueness (all queries need to be satisfied). In this paper we significantly extend the class of feasible entangled queries beyond uniqueness and safety. First, we show that we can simply drop uniqueness and still efficiently evaluate a set of safe entangled queries. Second, we show that as long as all users coordinate on the same set of attributes, we can give an efficient algorithm for coordination even if the set of queries does not satisfy safety. In an experimental evaluation we show that our algorithms are feasible for a wide spectrum of coordination scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2012:EMW, author = "Xiaofei Zhang and Lei Chen and Min Wang", title = "Efficient multi-way theta-join processing using {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1184--1195", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-way Theta-join queries are powerful in describing complex relations and therefore widely employed in real practices. However, existing solutions from traditional distributed and parallel databases for multi-way Theta-join queries cannot be easily extended to fit a shared-nothing distributed computing paradigm, which is proven to be able to support OLAP applications over immense data volumes. In this work, we study the problem of efficient processing of multi-way Theta-join queries using MapReduce from a cost-effective perspective. 
Although there have been some works using the (key, value) pair-based programming model to support join operations, efficient processing of multi-way Theta-join queries has never been fully explored. The substantial challenge lies in, given a number of processing units (that can run Map or Reduce tasks), mapping a multi-way Theta-join query to a number of MapReduce jobs and having them executed in a well scheduled sequence, such that the total processing time span is minimized. Our solution mainly includes two parts: (1) cost metrics for both single MapReduce job and a number of MapReduce jobs executed in a certain order; (2) the efficient execution of a chain-typed Theta-join with only one MapReduce job. Comparing with the query evaluation strategy proposed in [23] and the widely adopted Pig Latin and Hive SQL solutions, our method achieves significant improvement of the join processing efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lim:2012:STB, author = "Harold Lim and Herodotos Herodotou and Shivnath Babu", title = "{Stubby}: a transformation-based optimizer for {MapReduce} workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1196--1207", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is a growing trend of performing analysis on large datasets using workflows composed of MapReduce jobs connected through producer-consumer relationships based on data. This trend has spurred the development of a number of interfaces---ranging from program-based to query-based interfaces---for generating MapReduce workflows. 
Studies have shown that the gap in performance can be quite large between optimized and unoptimized workflows. However, automatic cost-based optimization of MapReduce workflows remains a challenge due to the multitude of interfaces, large size of the execution plan space, and the frequent unavailability of all types of information needed for optimization. We introduce a comprehensive plan space for MapReduce workflows generated by popular workflow generators. We then propose Stubby, a cost-based optimizer that searches selectively through the subspace of the full plan space that can be enumerated correctly and costed based on the information available in any given setting. Stubby enumerates the plan space based on plan-to-plan transformations and an efficient search algorithm. Stubby is designed to be extensible to new interfaces and new types of optimizations, which is a desirable feature given how rapidly MapReduce systems are evolving. Stubby's efficiency and effectiveness have been evaluated using representative workflows from many domains.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bao:2012:LWV, author = "Zhuowei Bao and Susan B. Davidson and Tova Milo", title = "Labeling workflow views with fine-grained dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1208--1219", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper considers the problem of efficiently answering reachability queries over views of provenance graphs, derived from executions of workflows that may include recursion. Such views include composite modules and model fine-grained dependencies between module inputs and outputs. 
A novel view-adaptive dynamic labeling scheme is developed for efficient query evaluation, in which view specifications are labeled statically (i.e., as they are created) and data items are labeled dynamically as they are produced during a workflow execution. Although the combination of fine-grained dependencies and recursive workflows entails, in general, long (linear-size) data labels, we show that for a large natural class of workflows and views, labels are compact (logarithmic-size) and reachability queries can be evaluated in constant time. Experimental results demonstrate the benefit of this approach over the state-of-the-art technique when applied for labeling multiple views.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Szlichta:2012:FOD, author = "Jaros{\l}aw Szlichta and Parke Godfrey and Jarek Gryz", title = "Fundamentals of order dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1220--1231", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dependencies have played a significant role in database design for many years. They have also been shown to be useful in query optimization. In this paper, we discuss dependencies between lexicographically ordered sets of tuples. We introduce formally the concept of order dependency and present a set of axioms (inference rules) for them. We show how query rewrites based on these axioms can be used for query optimization. We present several interesting theorems that can be derived using the inference rules.
We prove that functional dependencies are subsumed by order dependencies and that our set of axioms for order dependencies is sound and complete.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bakibayev:2012:FQE, author = "Nurzhan Bakibayev and Dan Olteanu and Jakub Z{\'a}vodn{\'y}", title = "{FDB}: a query engine for factorised relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1232--1243", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Factorised databases are relational databases that use compact factorised representations at the physical layer to reduce data redundancy and boost query performance. This paper introduces FDB, an in-memory query engine for select-project-join queries on factorised databases. Key components of FDB are novel algorithms for query optimisation and evaluation that exploit the succinctness brought by data factorisation. Experiments show that for data sets with many-to-many relationships FDB can outperform relational engines by orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2012:OAW, author = "Yu Cao and Chee-Yong Chan and Jie Li and Kian-Lee Tan", title = "Optimization of analytic window functions", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1244--1255", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analytic functions represent the state-of-the-art way of performing complex data analysis within a single SQL statement. In particular, an important class of analytic functions that has been frequently used in commercial systems to support OLAP and decision support applications is the class of window functions. A window function returns for each input tuple a value derived from applying a function over a window of neighboring tuples. However, existing window function evaluation approaches are based on a naive sorting scheme. In this paper, we study the problem of optimizing the evaluation of window functions. We propose several efficient techniques, and identify optimization opportunities that allow us to optimize the evaluation of a set of window functions. We have integrated our scheme into PostgreSQL. Our comprehensive experimental study on the TPC-DS datasets as well as synthetic datasets and queries demonstrate significant speedup over existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hueske:2012:OBB, author = "Fabian Hueske and Mathias Peters and Matthias J. 
Sax and Astrid Rheinl{\"a}nder and Rico Bergmann and Aljoscha Krettek and Kostas Tzoumas", title = "Opening the black boxes in data flow optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1256--1267", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many systems for big data analytics employ a data flow abstraction to define parallel data processing tasks. In this setting, custom operations expressed as user-defined functions are very common. We address the problem of performing data flow optimization at this level of abstraction, where the semantics of operators are not known. Traditionally, query optimization is applied to queries with known algebraic semantics. In this work, we find that a handful of properties, rather than a full algebraic specification, suffice to establish reordering conditions for data processing operators. We show that these properties can be accurately estimated for black box operators by statically analyzing the general-purpose code of their user-defined functions. We design and implement an optimizer for parallel data flows that does not assume knowledge of semantics or algebraic properties of operators. Our evaluation confirms that the optimizer can apply common rewritings such as selection reordering, bushy join-order enumeration, and limited forms of aggregation push-down, hence yielding similar rewriting power as modern relational DBMS optimizers. Moreover, it can optimize the operator order of nonrelational data flows, a unique feature among today's systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ewen:2012:SFI, author = "Stephan Ewen and Kostas Tzoumas and Moritz Kaufmann and Volker Markl", title = "Spinning fast iterative data flows", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1268--1279", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Parallel dataflow systems are a central part of most analytic pipelines for big data. The iterative nature of many analysis and machine learning algorithms, however, is still a challenge for current systems. While certain types of bulk iterative algorithms are supported by novel dataflow frameworks, these systems cannot exploit computational dependencies present in many algorithms, such as graph algorithms. As a result, these algorithms are inefficiently executed and have led to specialized systems based on other paradigms, such as message passing or shared memory. We propose a method to integrate incremental iterations, a form of workset iterations, with parallel dataflows. After showing how to integrate bulk iterations into a dataflow system and its optimizer, we present an extension to the programming model for incremental iterations. The extension alleviates for the lack of mutable state in dataflows and allows for exploiting the sparse computational dependencies inherent in many iterative algorithms. The evaluation of a prototypical implementation shows that those aspects lead to up to two orders of magnitude speedup in algorithm runtime, when exploited. In our experiments, the improved dataflow system is highly competitive with specialized systems while maintaining a transparent and unified dataflow abstraction.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mihaylov:2012:RRD,
  author =       "Svilen R. Mihaylov and Zachary G. Ives and Sudipto Guha",
  title =        "{REX}: recursive, delta-based data-centric computation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1280--1291",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In today's Web and social network environments, query workloads include ad hoc and OLAP queries, as well as iterative algorithms that analyze data relationships (e.g., link analysis, clustering, learning). Modern DBMSs support ad hoc and OLAP queries, but most are not robust enough to scale to large clusters. Conversely, ``cloud'' platforms like MapReduce execute chains of batch tasks across clusters in a fault tolerant way, but have too much overhead to support ad hoc queries. Moreover, both classes of platform incur significant overhead in executing iterative data analysis algorithms. Most such iterative algorithms repeatedly refine portions of their answers, until some convergence criterion is reached. However, general cloud platforms typically must reprocess all data in each step. DBMSs that support recursive SQL are more efficient in that they propagate only the changes in each step --- but they still accumulate each iteration's state, even if it is no longer useful. User-defined functions are also typically harder to write for DBMSs than for cloud platforms. We seek to unify the strengths of both styles of platforms, with a focus on supporting iterative computations in which changes, in the form of deltas, are propagated from iteration to iteration, and state is efficiently updated in an extensible way.
We present a programming model oriented around deltas, describe how we execute and optimize such programs in our REX runtime system, and validate that our platform also handles failures gracefully. We experimentally validate our techniques, and show speedups over the competing methods ranging from 2.5 to nearly 100 times.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cheng:2012:KRW,
  author =       "James Cheng and Zechao Shang and Hong Cheng and Haixun Wang and Jeffrey Xu Yu",
  title =        "{K}-reach: who is in your small world",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1292--1303",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We study the problem of answering $k$-hop reachability queries in a directed graph, i.e., whether there exists a directed path of length $k$, from a source query vertex to a target query vertex in the input graph. The problem of $k$-hop reachability is a general problem of the classic reachability (where $ k = \infty $). Existing indexes for processing classic reachability queries, as well as for processing shortest path queries, are not applicable or not efficient for processing $k$-hop reachability queries. We propose an index for processing $k$-hop reachability queries, which is simple in design and efficient to construct. Our experimental results on a wide range of real datasets show that our index is more efficient than the state-of-the-art indexes even for processing classic reachability queries, for which these indexes are primarily designed. We also show that our index is efficient in answering $k$-hop reachability queries.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2012:PGD,
  author =       "Wenfei Fan and Xin Wang and Yinghui Wu",
  title =        "Performance guarantees for distributed reachability queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1304--1315",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In the real world a graph is often fragmented and distributed across different sites. This highlights the need for evaluating queries on distributed graphs. This paper proposes distributed evaluation algorithms for three classes of queries: reachability for determining whether one node can reach another, bounded reachability for deciding whether there exists a path of a bounded length between a pair of nodes, and regular reachability for checking whether there exists a path connecting two nodes such that the node labels on the path form a string in a given regular expression. We develop these algorithms based on partial evaluation, to explore parallel computation. When evaluating a query Q on a distributed graph G, we show that these algorithms possess the following performance guarantees, no matter how G is fragmented and distributed: (1) each site is visited only once; (2) the total network traffic is determined by the size of Q and the fragmentation of G, independent of the size of G; and (3) the response time is decided by the largest fragment of G rather than the entire G. In addition, we show that these algorithms can be readily implemented in the MapReduce framework. Using synthetic and real-life data, we experimentally verify that these algorithms are scalable on large graphs, regardless of how the graphs are distributed.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chubak:2012:EIQ,
  author =       "Pirooz Chubak and Davood Rafiei",
  title =        "Efficient indexing and querying over syntactically annotated trees",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1316--1327",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Natural language text corpora are often available as sets of syntactically parsed trees. A wide range of expressive tree queries are possible over such parsed trees that open a new avenue in searching over natural language text. They not only allow for querying roles and relationships within sentences, but also improve search effectiveness compared to flat keyword queries. One major drawback of current systems supporting querying over parsed text is the performance of evaluating queries over large data. In this paper we propose a novel indexing scheme over unique subtrees as index keys. We also propose a novel root-split coding scheme that stores subtree structural information only partially, thus reducing index size and improving querying performance. Our extensive set of experiments show that root-split coding reduces the index size of any interval coding which stores individual node numbers by a factor of 50\% to 80\%, depending on the sizes of subtrees indexed. Moreover, we show that our index using root-split coding, outperforms previous approaches by at least an order of magnitude in terms of the response time of queries.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Barany:2012:QGN,
  author =       "Vince B{\'a}r{\'a}ny and Balder ten Cate and Martin Otto",
  title =        "Queries with guarded negation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1328--1339",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "A well-established and fundamental insight in database theory is that negation (also known as complementation) tends to make queries difficult to process and difficult to reason about. Many basic problems are decidable and admit practical algorithms in the case of unions of conjunctive queries, but become difficult or even undecidable when queries are allowed to contain negation. Inspired by recent results in finite model theory, we consider a restricted form of negation, guarded negation. We introduce a fragment of SQL, called GN-SQL, as well as a fragment of Datalog with stratified negation, called GN-Datalog, that allow only guarded negation, and we show that these query languages are computationally well behaved, in terms of testing query containment, query evaluation, open-world query answering, and boundedness. GN-SQL and GN-Datalog subsume a number of well known query languages and constraint languages, such as unions of conjunctive queries, monadic Datalog, and frontier-guarded tgds. In addition, an analysis of standard benchmark workloads shows that many uses of negation in SQL in practice are guarded.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2012:PFI,
  author =       "Ninghui Li and Wahbeh Qardaji and Dong Su and Jianneng Cao",
  title =        "{PrivBasis}: frequent itemset mining with differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1340--1351",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The discovery of frequent itemsets can serve valuable economic and research purposes. Releasing discovered frequent itemsets, however, presents privacy challenges. In this paper, we study the problem of how to perform frequent itemset mining on transaction databases while satisfying differential privacy. We propose an approach, called PrivBasis, which leverages a novel notion called basis sets. A $ \theta $-basis set has the property that any itemset with frequency higher than $ \theta $ is a subset of some basis. We introduce algorithms for privately constructing a basis set and then using it to find the most frequent itemsets. Experiments show that our approach greatly outperforms the current state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yuan:2012:LRM,
  author =       "Ganzhao Yuan and Zhenjie Zhang and Marianne Winslett and Xiaokui Xiao and Yin Yang and Zhifeng Hao",
  title =        "Low-rank mechanism: optimizing batch queries under differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1352--1363",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Differential privacy is a promising privacy-preserving paradigm for statistical query processing over sensitive data. It works by injecting random noise into each query result, such that it is provably hard for the adversary to infer the presence or absence of any individual record from the published noisy results. The main objective in differentially private query processing is to maximize the accuracy of the query results, while satisfying the privacy guarantees. Previous work, notably the matrix mechanism [16], has suggested that processing a batch of correlated queries as a whole can potentially achieve considerable accuracy gains, compared to answering them individually. However, as we point out in this paper, the matrix mechanism is mainly of theoretical interest; in particular, several inherent problems in its design limit its accuracy in practice, which almost never exceeds that of na{\"\i}ve methods. In fact, we are not aware of any existing solution that can effectively optimize a query batch under differential privacy. Motivated by this, we propose the Low-Rank Mechanism (LRM), the first practical differentially private technique for answering batch queries with high accuracy, based on a low rank approximation of the workload matrix.
We prove that the accuracy provided by LRM is close to the theoretical lower bound for any mechanism to answer a batch of queries under differential privacy. Extensive experiments using real data demonstrate that LRM consistently outperforms state-of-the-art query processing solutions under differential privacy, by large margins.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2012:FMR,
  author =       "Jun Zhang and Zhenjie Zhang and Xiaokui Xiao and Yin Yang and Marianne Winslett",
  title =        "Functional mechanism: regression analysis under differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1364--1375",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "$ \epsilon $-differential privacy is the state-of-the-art model for releasing sensitive information while protecting privacy. Numerous methods have been proposed to enforce $ \epsilon $-differential privacy in various analytical tasks, e.g., regression analysis. Existing solutions for regression analysis, however, are either limited to non-standard types of regression or unable to produce accurate regression results. Motivated by this, we propose the Functional Mechanism, a differentially private method designed for a large class of optimization-based analyses. The main idea is to enforce $ \epsilon $-differential privacy by perturbing the objective function of the optimization problem, rather than its results. As case studies, we apply the functional mechanism to address two most widely used regression models, namely, linear regression and logistic regression.
Both theoretical analysis and thorough experimental evaluations show that the functional mechanism is highly effective and efficient, and it significantly outperforms existing solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Boldi:2012:IUG,
  author =       "Paolo Boldi and Francesco Bonchi and Aristides Gionis and Tamir Tassa",
  title =        "Injecting uncertainty in graphs for identity obfuscation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1376--1387",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data collected nowadays by social-networking applications create fascinating opportunities for building novel services, as well as expanding our understanding about social structures and their dynamics. Unfortunately, publishing social-network graphs is considered an ill-advised practice due to privacy concerns. To alleviate this problem, several anonymization methods have been proposed, aiming at reducing the risk of a privacy breach on the published data, while still allowing to analyze them and draw relevant conclusions. In this paper we introduce a new anonymization approach that is based on injecting uncertainty in social graphs and publishing the resulting uncertain graphs. While existing approaches obfuscate graph data by adding or removing edges entirely, we propose using a finer-grained perturbation that adds or removes edges partially: this way we can achieve the same desired level of obfuscation with smaller changes in the data, thus maintaining higher utility.
Our experiments on real-world networks confirm that at the same level of identity obfuscation our method provides higher usefulness than existing randomized methods that publish standard graphs.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cao:2012:PMR,
  author =       "Jianneng Cao and Panagiotis Karras",
  title =        "Publishing microdata with a robust privacy guarantee",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1388--1399",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Today, the publication of microdata poses a privacy threat. Vast research has striven to define the privacy condition that microdata should satisfy before it is released, and devise algorithms to anonymize the data so as to achieve this condition. Yet, no method proposed to date explicitly bounds the percentage of information an adversary gains after seeing the published data for each sensitive value therein. This paper introduces $ \beta $-likeness, an appropriately robust privacy model for microdata anonymization, along with two anonymization schemes designed therefore, the one based on generalization, and the other based on perturbation. Our model postulates that an adversary's confidence on the likelihood of a certain sensitive-attribute (SA) value should not increase, in relative difference terms, by more than a predefined threshold. Our techniques aim to satisfy a given $ \beta $ threshold with little information loss.
We experimentally demonstrate that (i) our model provides an effective privacy guarantee in a way that predecessor models cannot, (ii) our generalization scheme is more effective and efficient in its task than methods adapting algorithms for the $k$-anonymity model, and (iii) our perturbation method outperforms a baseline approach. Moreover, we discuss in detail the resistance of our model and methods to attacks proposed in previous research.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Guan:2012:MTE,
  author =       "Ziyu Guan and Xifeng Yan and Lance M. Kaplan",
  title =        "Measuring two-event structural correlations on graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1400--1411",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Real-life graphs usually have various kinds of events happening on them, e.g., product purchases in online social networks and intrusion alerts in computer networks. The occurrences of events on the same graph could be correlated, exhibiting either attraction or repulsion. Such structural correlations can reveal important relationships between different events. Unfortunately, correlation relationships on graph structures are not well studied and cannot be captured by traditional measures. In this work, we design a novel measure for assessing two-event structural correlations on graphs. Given the occurrences of two events, we choose uniformly a sample of ``reference nodes'' from the vicinity of all event nodes and employ the Kendall's $ \tau $ rank correlation measure to compute the average concordance of event density changes.
Significance can be efficiently assessed by $ \tau $'s nice property of being asymptotically normal under the null hypothesis. In order to compute the measure in large scale networks, we develop a scalable framework using different sampling strategies. The complexity of these strategies is analyzed. Experiments on real graph datasets with both synthetic and real events demonstrate that the proposed framework is not only efficacious, but also efficient and scalable.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Jestes:2012:RLT,
  author =       "Jeffrey Jestes and Jeff M. Phillips and Feifei Li and Mingwang Tang",
  title =        "Ranking large temporal data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1412--1423",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Ranking temporal data has not been studied until recently, even though ranking is an important operator (being promoted as a first-class citizen) in database systems. However, only the instant top-$k$ queries on temporal data were studied, where objects with the $k$ highest scores at a query time instance $t$ are to be retrieved. The instant top-$k$ definition clearly comes with limitations (sensitive to outliers, difficult to choose a meaningful query time $t$). A more flexible and general ranking operation is to rank objects based on the aggregation of their scores in a query interval, which we dub the aggregate top-$k$ query on temporal data. For example, return the top-10 weather stations having the highest average temperature from 10/01/2010 to 10/07/2010; find the top-20 stocks having the largest total transaction volumes from 02/05/2011 to 02/07/2011.
This work presents a comprehensive study to this problem by designing both exact and approximate methods (with approximation quality guarantees). We also provide theoretical analysis on the construction cost, the index size, the update and the query costs of each approach. Extensive experiments on large real datasets clearly demonstrate the efficiency, the effectiveness, and the scalability of our methods compared to the baseline methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Funke:2012:CTD,
  author =       "Florian Funke and Alfons Kemper and Thomas Neumann",
  title =        "Compacting transactional data in hybrid {OLTP\&OLAP} databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1424--1435",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Growing main memory sizes have facilitated database management systems that keep the entire database in main memory. The drastic performance improvements that came along with these in-memory systems have made it possible to reunite the two areas of online transaction processing (OLTP) and online analytical processing (OLAP): An emerging class of hybrid OLTP and OLAP database systems allows to process analytical queries directly on the transactional data. By offering arbitrarily current snapshots of the transactional data for OLAP, these systems enable real-time business intelligence. Despite memory sizes of several Terabytes in a single commodity server, RAM is still a precious resource: Since free memory can be used for intermediate results in query processing, the amount of memory determines query performance to a large extent.
Consequently, we propose the compaction of memory-resident databases. Compaction consists of two tasks: First, separating the mutable working set from the immutable ``frozen'' data. Second, compressing the immutable data and optimizing it for efficient, memory-consumption-friendly snapshotting. Our approach reorganizes and compresses transactional data online and yet hardly affects the mission-critical OLTP throughput. This is achieved by unburdening the OLTP threads from all additional processing and performing these tasks asynchronously.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hall:2012:PTC,
  author =       "Alexander Hall and Olaf Bachmann and Robert B{\"u}ssow and Silviu Ganceanu and Marc Nunkesser",
  title =        "Processing a trillion cells per mouse click",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1436--1446",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Column-oriented database systems have been a real game changer for the industry in recent years. Highly tuned and performant systems have evolved that provide users with the possibility of answering ad hoc queries over large datasets in an interactive manner. In this paper we present the column-oriented datastore developed as one of the central components of PowerDrill. It combines the advantages of columnar data layout with other known techniques (such as using composite range partitions) and extensive algorithmic engineering on key data structures. The main goal of the latter being to reduce the main memory footprint and to increase the efficiency in processing typical user queries. In this combination we achieve large speed-ups.
These enable a highly interactive Web UI where it is common that a single mouse click leads to processing a trillion values in the underlying dataset.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Porobic:2012:OHI,
  author =       "Danica Porobic and Ippokratis Pandis and Miguel Branco and Pinar T{\"o}z{\"u}n and Anastasia Ailamaki",
  title =        "{OLTP} on hardware islands",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1447--1458",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Modern hardware is abundantly parallel and increasingly heterogeneous. The numerous processing cores have nonuniform access latencies to the main memory and to the processor caches, which causes variability in the communication costs. Unfortunately, database systems mostly assume that all processing cores are the same and that microarchitecture differences are not significant enough to appear in critical database execution paths. As we demonstrate in this paper, however, hardware heterogeneity does appear in the critical path and conventional database architectures achieve suboptimal and even worse, unpredictable performance. We perform a detailed performance analysis of OLTP deployments in servers with multiple cores per CPU (multicore) and multiple CPUs per server (multisocket). We compare different database deployment strategies where we vary the number and size of independent database instances running on a single server, from a single shared-everything instance to fine-grained shared-nothing configurations.
We quantify the impact of non-uniform hardware on various deployments by (a) examining how efficiently each deployment uses the available hardware resources and (b) measuring the impact of distributed transactions and skewed requests on different workloads. Finally, we argue in favor of shared-nothing deployments that are topology- and workload-aware and take advantage of fast on-chip communication between islands of cores on the same socket.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Patterson:2012:SSC,
  author =       "Stacy Patterson and Aaron J. Elmore and Faisal Nawab and Divyakant Agrawal and Amr {El Abbadi}",
  title =        "Serializability, not serial: concurrency control and availability in multi-datacenter datastores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1459--1470",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We present a framework for concurrency control and availability in multi-datacenter datastores. While we consider Google's Megastore as our motivating example, we define general abstractions for key components, making our solution extensible to any system that satisfies the abstraction properties. We first develop and analyze a transaction management and replication protocol based on a straightforward implementation of the Paxos algorithm. Our investigation reveals that this protocol acts as a concurrency prevention mechanism rather than a concurrency control mechanism. We then propose an enhanced protocol called Paxos with Combination and Promotion (Paxos-CP) that provides true transaction concurrency while requiring the same per instance message complexity as the basic Paxos protocol.
Finally, we compare the performance of Paxos and Paxos-CP in a multi-datacenter experimental study, and we demonstrate that Paxos-CP results in significantly fewer aborted transactions than basic Paxos.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cheung:2012:APD,
  author =       "Alvin Cheung and Samuel Madden and Owen Arden and Andrew C. Myers",
  title =        "Automatic partitioning of database applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1471--1482",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Database-backed applications are nearly ubiquitous in our daily lives. Applications that make many small accesses to the database create two challenges for developers: increased latency and wasted resources from numerous network round trips. A well-known technique to improve transactional database application performance is to convert part of the application into stored procedures that are executed on the database server. Unfortunately, this conversion is often difficult. In this paper we describe Pyxis, a system that takes database-backed applications and automatically partitions their code into two pieces, one of which is executed on the application server and the other on the database server. Pyxis profiles the application and server loads, statically analyzes the code's dependencies, and produces a partitioning that minimizes the number of control transfers as well as the amount of data sent during each transfer.
Our experiments using TPC-C and TPC-W show that Pyxis is able to generate partitions with up to 3x reduction in latency and 1.7x improvement in throughput when compared to a traditional non-partitioned implementation and has comparable performance to that of a custom stored procedure implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2012:CCE,
  author =       "Jiannan Wang and Tim Kraska and Michael J. Franklin and Jianhua Feng",
  title =        "{CrowdER}: crowdsourcing entity resolution",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "5",
  number =       "11",
  pages =        "1483--1494",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Nov 6 16:43:15 MST 2012",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Entity resolution is central to data integration and data cleaning. Algorithmic approaches have been improving in quality, but remain far from perfect. Crowdsourcing platforms offer a more accurate but expensive (and slow) way to bring human insight into the process. Previous work has proposed batching verification tasks for presentation to human workers but even with batching, a human-only approach is infeasible for data sets of even moderate size, due to the large numbers of matches to be tested. Instead, we propose a hybrid human-machine approach in which machines are used to do an initial, coarse pass over all the data, and people are used to verify only the most likely matching pairs. We show that for such a hybrid system, generating the minimum number of verification tasks of a given size is NP-Hard, but we develop a novel two-tiered heuristic approach for creating batched tasks. We describe this method, and present the results of extensive experiments on real data sets using a popular crowdsourcing platform.
The experiments show that our hybrid approach achieves both good efficiency and high accuracy compared to machine-only or human-only alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2012:WAJ, author = "Caleb Chen Cao and Jieying She and Yongxin Tong and Lei Chen", title = "Whom to ask?: jury selection for decision making tasks on micro-blog services", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1495--1506", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "It is universal to see people obtain knowledge on micro-blog services by asking others decision making questions. In this paper, we study the Jury Selection Problem (JSP) by utilizing crowdsourcing for decision making tasks on micro-blog services. Specifically, the problem is to enroll a subset of crowd under a limited budget, whose aggregated wisdom via Majority Voting scheme has the lowest probability of drawing a wrong answer (Jury Error Rate-JER). Due to various individual error-rates of the crowd, the calculation of JER is non-trivial. Firstly, we explicitly state that JER is the probability when the number of wrong jurors is larger than half of the size of a jury. To avoid the exponentially increasing calculation of JER, we propose two efficient algorithms and an effective bounding technique. Furthermore, we study the Jury Selection Problem on two crowdsourcing models, one is for altruistic users (AltrM) and the other is for incentive-requiring users (PayM) who require extra payment when enrolled into a task. For the AltrM model, we prove the monotonicity of JER on individual error rate and propose an efficient exact algorithm for JSP.
For the PayM model, we prove the NP-hardness of JSP on PayM and propose an efficient greedy-based heuristic algorithm. Finally, we conduct a series of experiments to investigate the traits of JSP, and validate the efficiency and effectiveness of our proposed algorithms on both synthetic and real micro-blog data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2012:AAL, author = "Xiaochun Yang and Honglei Liu and Bin Wang", title = "{ALAE}: accelerating local alignment with affine gap exactly in biosequence databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1507--1518", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of local alignment, which is finding pairs of similar subsequences with gaps. The problem exists in biosequence databases. BLAST is a typical software for finding local alignment based on heuristic, but could miss results. Using the Smith-Waterman algorithm, we can find all local alignments in $ O(m n) $ time, where $m$ and $n$ are lengths of a query and a text, respectively. A recent exact approach BWT-SW improves the complexity of the Smith-Waterman algorithm under constraints, but still much slower than BLAST. This paper takes on the challenge of designing an accurate and efficient algorithm for evaluating local-alignment searches, especially for long queries. In this paper, we propose an efficient software called ALAE to speed up BWT-SW using a compressed suffix array. ALAE utilizes a family of filtering techniques to prune meaningless calculations and an algorithm for reusing score calculations. 
We also give a mathematical analysis and show that the upper bound of the total number of calculated entries using ALAE could vary from $ 4.50 m n^{0.520} $ to $ 9.05 m n^{0.896} $ for random DNA sequences and vary from $ 8.28 m n^{0.364} $ to $ 7.49 m n^{0.723} $ for random protein sequences. We demonstrate the significant performance improvement of ALAE on BWT-SW using a thorough experimental study on real biosequences. ALAE guarantees correctness and accelerates BLAST for most of parameters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Candan:2012:SCD, author = "K. Sel{\c{c}}uk Candan and Rosaria Rossini and Xiaolan Wang and Maria Luisa Sapino", title = "{sDTW}: computing {DTW} distances using locally relevant constraints based on salient feature alignments", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1519--1530", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many applications generate and consume temporal data and retrieval of time series is a key processing step in many application domains. Dynamic time warping (DTW) distance between time series of size N and M is computed relying on a dynamic programming approach which creates and fills an N x M grid to search for an optimal warp path. Since this can be costly, various heuristics have been proposed to cut away the potentially unproductive portions of the DTW grid. In this paper, we argue that time series often carry structural features that can be used for identifying locally relevant constraints to eliminate redundant work.
Relying on this observation, we propose salient feature based sDTW algorithms which first identify robust salient features in the given time series and then find a consistent alignment of these to establish the boundaries for the warp path search. More specifically, we propose alternative fixed core \& adaptive width, adaptive core \& fixed width, and adaptive core \& adaptive width strategies which enforce different constraints reflecting the high level structural characteristics of the series in the data set. Experiment results show that the proposed sDTW algorithms help achieve much higher accuracy in DTW computation and time series retrieval than fixed core \& fixed width algorithms that do not leverage local features of the given time series.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tauheed:2012:SPL, author = "Farhan Tauheed and Thomas Heinis and Felix Sch{\"u}rmann and Henry Markram and Anastasia Ailamaki", title = "{SCOUT}: prefetching for latent structure following queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1531--1542", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's scientists are quickly moving from in vitro to in silico experimentation: they no longer analyze natural phenomena in a petri dish, but instead they build models and simulate them. Managing and analyzing the massive amounts of data involved in simulations is a major task. Yet, they lack the tools to efficiently work with data of this size. One problem many scientists share is the analysis of the massive spatial models they build.
For several types of analysis they need to interactively follow the structures in the spatial model, e.g., the arterial tree, neuron fibers, etc., and issue range queries along the way. Each query takes long to execute, and the total time for executing a sequence of queries significantly delays data analysis. Prefetching the spatial data reduces the response time considerably, but known approaches do not prefetch with high accuracy. We develop SCOUT, a structure-aware method for prefetching data along interactive spatial query sequences. SCOUT uses an approximate graph model of the structures involved in past queries and attempts to identify what particular structure the user follows. Our experiments with neuro-science data show that SCOUT prefetches with an accuracy from 71\% to 92\%, which translates to a speedup of 4x-15x. SCOUT also improves the prefetching accuracy on datasets from other scientific domains, such as medicine and biology.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2012:API, author = "Kaibo Wang and Yin Huai and Rubao Lee and Fusheng Wang and Xiaodong Zhang and Joel H. Saltz", title = "Accelerating pathology image data cross-comparison on {CPU--GPU} hybrid systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1543--1554", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As an important application of spatial databases in pathology imaging analysis, cross-comparing the spatial boundaries of a huge amount of segmented micro-anatomic objects demands extremely data- and compute-intensive operations, requiring high throughput at an affordable cost. 
However, the performance of spatial database systems has not been satisfactory since their implementations of spatial operations cannot fully utilize the power of modern parallel hardware. In this paper, we provide a customized software solution that exploits GPUs and multi-core CPUs to accelerate spatial cross-comparison in a cost-effective way. Our solution consists of an efficient GPU algorithm and a pipelined system framework with task migration support. Extensive experiments with real-world data sets demonstrate the effectiveness of our solution, which improves the performance of spatial cross-comparison by over 18 times compared with a parallelized spatial database approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2012:RER, author = "Jiexing Li and Arnd Christian K{\"o}nig and Vivek Narasayya and Surajit Chaudhuri", title = "Robust estimation of resource consumption for {SQL} queries using statistical techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1555--1566", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to estimate resource consumption of SQL queries is crucial for a number of tasks in a database system such as admission control, query scheduling and costing during query optimization. Recent work has explored the use of statistical techniques for resource estimation in place of the manually constructed cost models used in query optimization. 
Such techniques, which require as training data examples of resource usage in queries, offer the promise of superior estimation accuracy since they can account for factors such as hardware characteristics of the system or bias in cardinality estimates. However, the proposed approaches lack robustness in that they do not generalize well to queries that are different from the training examples, resulting in significant estimation errors. Our approach aims to address this problem by combining knowledge of database query processing with statistical models. We model resource-usage at the level of individual operators, with different models and features for each operator type, and explicitly model the asymptotic behavior of each operator. This results in significantly better estimation accuracy and the ability to estimate resource usage of arbitrary plans, even when they are very different from the training instances. We validate our approach using various large scale real-life and benchmark workloads on Microsoft SQL Server.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Das:2012:WTW, author = "Mahashweta Das and Saravanan Thirumuruganathan and Sihem Amer-Yahia and Gautam Das and Cong Yu", title = "Who tags what?: an analysis framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1567--1578", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rise of Web 2.0 is signaled by sites such as Flickr, del.icio.us, and YouTube, and social tagging is essential to their success. A typical tagging action involves three components, user, item (e.g., photos in Flickr), and tags (i.e., words or phrases). 
Analyzing how tags are assigned by certain users to certain items has important implications in helping users search for desired information. In this paper, we explore common analysis tasks and propose a dual mining framework for social tagging behavior mining. This framework is centered around two opposing measures, similarity and diversity, being applied to one or more tagging components, and therefore enables a wide range of analysis scenarios such as characterizing similar users tagging diverse items with similar tags, or diverse users tagging similar items with diverse tags, etc. By adopting different concrete measures for similarity and diversity in the framework, we show that a wide range of concrete analysis problems can be defined and they are NP-Complete in general. We design efficient algorithms for solving many of those problems and demonstrate, through comprehensive experiments over real data, that our algorithms significantly out-perform the exact brute-force approach without compromising analysis result quality.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2012:GFE, author = "Haohan Zhu and George Kollios and Vassilis Athitsos", title = "A generic framework for efficient and effective subsequence retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1579--1590", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes a general framework for matching similar subsequences in both time series and string databases. The matching results are pairs of query subsequences and database subsequences. 
The framework finds all possible pairs of similar subsequences if the distance measure satisfies the ``consistency'' property, which is a property introduced in this paper. We show that most popular distance functions, such as the Euclidean distance, DTW, ERP, the Fr{\'e}chet distance for time series, and the Hamming distance and Levenshtein distance for strings, are all ``consistent''. We also propose a generic index structure for metric spaces named ``reference net''. The reference net occupies $ O(n) $ space, where $n$ is the size of the dataset and is optimized to work well with our framework. The experiments demonstrate the ability of our method to improve retrieval performance when combined with diverse distance measures. The experiments also illustrate that the reference net scales well in terms of space overhead and query time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dittrich:2012:OAE, author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz and Stefan Richter and Stefan Schuh and Alekh Jindal and J{\"o}rg Schad", title = "Only aggressive elephants are fast elephants", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1591--1602", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Yellow elephants are slow. A major reason is that they consume their inputs entirely before responding to an elephant rider's orders. Some clever riders have trained their yellow elephants to only consume parts of the inputs before responding. However, the teaching time to make an elephant do that is high. So high that the teaching lessons often do not pay off. We take a different approach.
We make elephants aggressive; only this will make them very fast. We propose HAIL (Hadoop Aggressive Indexing Library), an enhancement of HDFS and Hadoop MapReduce that dramatically improves runtimes of several classes of MapReduce jobs. HAIL changes the upload pipeline of HDFS in order to create different clustered indexes on each data block replica. An interesting feature of HAIL is that we typically create a win-win situation: we improve both data upload to HDFS and the runtime of the actual Hadoop MapReduce job. In terms of data upload, HAIL improves over HDFS by up to 60\% with the default replication factor of three. In terms of query execution, we demonstrate that HAIL runs up to 68x faster than Hadoop. In our experiments, we use six clusters including physical and EC2 clusters of up to 100 nodes. A series of scalability experiments also demonstrates the superiority of HAIL.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2012:MLP, author = "Rui Li and Shengjie Wang and Kevin Chen-Chuan Chang", title = "Multiple location profiling for users and relationships from social network and content", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1603--1614", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Users' locations are important for many applications such as personalized search and localized content delivery. In this paper, we study the problem of profiling Twitter users' locations with their following network and tweets. 
We propose a multiple location profiling model (MLP), which has three key features: (1) it formally models how likely a user follows another user given their locations and how likely a user tweets a venue given his location, (2) it fundamentally captures that a user has multiple locations and his following relationships and tweeted venues can be related to any of his locations, and some of them are even noisy, and (3) it novelly utilizes the home locations of some users as partial supervision. As a result, MLP not only discovers users' locations accurately and completely, but also ``explains'' each following relationship by revealing users' true locations in the relationship. Experiments on a large-scale data set demonstrate those advantages. Particularly, (1) for predicting users' home locations, MLP successfully places 62\% users and out-performs two state-of-the-art methods by 10\% in accuracy, (2) for discovering users' multiple locations, MLP improves the baseline methods by 14\% in recall, and (3) for explaining following relationships, MLP achieves 57\% accuracy.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kang:2012:FBE, author = "Woon-Hak Kang and Sang-Won Lee and Bongki Moon", title = "Flash-based extended cache for higher throughput and faster recovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1615--1626", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Considering the current price gap between disk and flash memory drives, for applications dealing with large scale data, it will be economically more sensible to use flash memory drives to supplement disk drives rather than to replace them. 
This paper presents FaCE, which is a new low-overhead caching strategy that uses flash memory as an extension to the DRAM buffer. FaCE aims at improving the transaction throughput as well as shortening the recovery time from a system failure. To achieve the goals, we propose two novel algorithms for flash cache management, namely, Multi-Version FIFO replacement and Group Second Chance. One striking result from FaCE is that using a small flash memory drive as a caching device could deliver even higher throughput than using a large flash memory drive to store the entire database tables. This was possible due to flash write optimization as well as disk access reduction obtained by the FaCE caching methods. In addition, FaCE takes advantage of the non-volatility of flash memory to fully support database recovery by extending the scope of a persistent database to include the data pages stored in the flash cache. We have implemented FaCE in the PostgreSQL open source database server and demonstrated its effectiveness for TPC-C benchmarks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bender:2012:DTH, author = "Michael A. Bender and Martin Farach-Colton and Rob Johnson and Russell Kraner and Bradley C. Kuszmaul and Dzejla Medjedovic and Pablo Montes and Pradeep Shetty and Richard P. Spillane and Erez Zadok", title = "Don't thrash: how to cache your hash on flash", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1627--1637", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents new alternatives to the well-known Bloom filter data structure. 
The Bloom filter, a compact data structure supporting set insertion and membership queries, has found wide application in databases, storage systems, and networks. Because the Bloom filter performs frequent random reads and writes, it is used almost exclusively in RAM, limiting the size of the sets it can represent. This paper first describes the quotient filter, which supports the basic operations of the Bloom filter, achieving roughly comparable performance in terms of space and time, but with better data locality. Operations on the quotient filter require only a small number of contiguous accesses. The quotient filter has other advantages over the Bloom filter: it supports deletions, it can be dynamically resized, and two quotient filters can be efficiently merged. The paper then gives two data structures, the buffered quotient filter and the cascade filter, which exploit the quotient filter advantages and thus serve as SSD-optimized alternatives to the Bloom filter. The cascade filter has better asymptotic I/O performance than the buffered quotient filter, but the buffered quotient filter outperforms the cascade filter on small to medium data sets. Both data structures significantly outperform recently-proposed SSD-optimized Bloom filter variants, such as the elevator Bloom filter, buffered Bloom filter, and forest-structured Bloom filter. In experiments, the cascade filter and buffered quotient filter performed insertions 8.6--11 times faster than the fastest Bloom filter variant and performed lookups 0.94--2.56 times faster.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Isele:2012:LEL, author = "Robert Isele and Christian Bizer", title = "Learning expressive linkage rules using genetic programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1638--1649", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A central problem in data integration and data cleansing is to find entities in different data sources that describe the same real-world object. Many existing methods for identifying such entities rely on explicit linkage rules which specify the conditions that entities must fulfill in order to be considered to describe the same real-world object. In this paper, we present the GenLink algorithm for learning expressive linkage rules from a set of existing reference links using genetic programming. The algorithm is capable of generating linkage rules which select discriminative properties for comparison, apply chains of data transformations to normalize property values, choose appropriate distance measures and thresholds and combine the results of multiple comparisons using non-linear aggregation functions. Our experiments show that the GenLink algorithm outperforms the state-of-the-art genetic programming approach to learning linkage rules recently presented by Carvalho et al. and is capable of learning linkage rules which achieve a similar accuracy as human written rules for the same problem.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tong:2012:MFI, author = "Yongxin Tong and Lei Chen and Yurong Cheng and Philip S.
Yu", title = "Mining frequent itemsets over uncertain databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1650--1661", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, due to the wide applications of uncertain data, mining frequent itemsets over uncertain databases has attracted much attention. In uncertain databases, the support of an itemset is a random variable instead of a fixed occurrence counting of this itemset. Thus, unlike the corresponding problem in deterministic databases where the frequent itemset has a unique definition, the frequent itemset under uncertain environments has two different definitions so far. The first definition, referred as the expected support-based frequent itemset, employs the expectation of the support of an itemset to measure whether this itemset is frequent. The second definition, referred as the probabilistic frequent itemset, uses the probability of the support of an itemset to measure its frequency. Thus, existing work on mining frequent itemsets over uncertain databases is divided into two different groups and no study is conducted to comprehensively compare the two different definitions. In addition, since no uniform experimental platform exists, current solutions for the same definition even generate inconsistent results. In this paper, we firstly aim to clarify the relationship between the two different definitions. Through extensive experiments, we verify that the two definitions have a tight connection and can be unified together when the size of data is large enough. Secondly, we provide baseline implementations of eight existing representative algorithms and test their performances with uniform measures fairly. 
Finally, according to the fair tests over many different benchmark data sets, we clarify several existing inconsistent conclusions and discuss some new findings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dallachiesa:2012:UTS, author = "Michele Dallachiesa and Besmira Nushi and Katsiaryna Mirylenka and Themis Palpanas", title = "Uncertain time-series similarity: return to the basics", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1662--1673", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the last years there has been a considerable increase in the availability of continuous sensor measurements in a wide range of application domains, such as Location-Based Services (LBS), medical monitoring systems, manufacturing plants and engineering facilities to ensure efficiency, product quality and safety, hydrologic and geologic observing systems, pollution management, and others. Due to the inherent imprecision of sensor observations, many investigations have recently turned into querying, mining and storing uncertain data. Uncertainty can also be due to data aggregation, privacy-preserving transforms, and error-prone mining algorithms. In this study, we survey the techniques that have been proposed specifically for modeling and processing uncertain time series, an important model for temporal data. We provide an analytical evaluation of the alternatives that have been proposed in the literature, highlighting the advantages and disadvantages of each approach, and further compare these alternatives with two additional techniques that were carefully studied before. 
We conduct an extensive experimental evaluation with 17 real datasets, and discuss some surprising results, which suggest that a fruitful research direction is to take into account the temporal correlations in the time series. Based on our evaluations, we also provide guidelines useful for the practitioners in the field.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dasu:2012:SDC, author = "Tamraparni Dasu and Ji Meng Loh", title = "Statistical distortion: consequences of data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1674--1683", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce the notion of statistical distortion as an essential metric for measuring the effectiveness of data cleaning strategies. We use this metric to propose a widely applicable yet scalable experimental framework for evaluating data cleaning strategies along three dimensions: glitch improvement, statistical distortion and cost-related criteria. Existing metrics focus on glitch improvement and cost, but not on the statistical impact of data cleaning strategies. We illustrate our framework on real world data, with a comprehensive suite of experiments and analyses.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lang:2012:TEE, author = "Willis Lang and Stavros Harizopoulos and Jignesh M. Patel and Mehul A. 
Shah and Dimitris Tsirogiannis", title = "Towards energy-efficient database cluster design", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "11", pages = "1684--1695", month = jul, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:15 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Energy is a growing component of the operational cost for many ``big data'' deployments, and hence has become increasingly important for practitioners of large-scale data analysis who require scale-out clusters or parallel DBMS appliances. Although a number of recent studies have investigated the energy efficiency of DBMSs, none of these studies have looked at the architectural design space of energy-efficient parallel DBMS clusters. There are many challenges to increasing the energy efficiency of a DBMS cluster, including dealing with the inherent scaling inefficiency of parallel data processing, and choosing the appropriate energy-efficient hardware. In this paper, we experimentally examine and analyze a number of key parameters related to these challenges for designing energy-efficient database clusters. We explore the cluster design space using empirical results and propose a model that considers the key bottlenecks to energy efficiency in a parallel DBMS. This paper represents a key first step in designing energy-efficient database clusters, which is increasingly important given the trend toward parallel database appliances.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jensen:2012:DMS, author = "Christian S. 
Jensen", title = "Data management on the spatial web", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1696--1696", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due in part to the increasing mobile use of the web and the proliferation of geo-positioning, the web is fast acquiring a significant spatial aspect. Content and users are being augmented with locations that are used increasingly by location-based services. Studies suggest that each week, several billion web queries are issued that have local intent and target spatial web objects. These are points of interest with a web presence, and they thus have locations as well as textual descriptions. This development has given prominence to spatial web data management, an area ripe with new and exciting opportunities and challenges. The research community has embarked on inventing and supporting new query functionality for the spatial web. Different kinds of spatial web queries return objects that are near a location argument and are relevant to a text argument. To support such queries, it is important to be able to rank objects according to their relevance to a query. And it is important to be able to process the queries with low latency. The talk offers an overview of key aspects of the spatial web. Based on recent results obtained by the speaker and his colleagues, the talk explores new query functionality enabled by the setting. Further, the talk offers insight into the data management techniques capable of supporting such functionality.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dietrich:2012:DAO, author = "Brenda Dietrich", title = "Data analytics opportunities in a smarter planet", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1697--1697", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "New applications of computing are being enabled by instrumentation of physical entities, aggregation of data, and the analysis of the data. The resulting integration of information and control permits efficient and effective management of complex man-made systems. Examples include transportation systems, buildings, electrical grids, health care systems, governments, and supply chains. Achieving this vision requires extensive data integration and analysis, over diverse, rapidly changing, and often uncertain data. There are many challenges, requiring both new data management techniques as well as new mathematics, forcing new collaborations as the basis of the new ``Data Science''. Needs and opportunities will be discussed in the context of specific pilots and projects.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sahin:2012:CEM, author = "Kenan Sahin", title = "Challenges in economic massive content storage and management ({MCSAM}) in the era of self-organizing, self-expanding and self-linking data clusters", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1698--1698", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Rapid spread of social networks, global on-line shopping, post 9/11 security oriented linking of data bases and foremost the global adoption of smart phones/devices, among other phenomena, are transforming data clusters into dynamic and almost uncontrollable entities that have their own local intelligence, clients and objectives. The scale and rapidity of change is such that large scale innovations in content storage and management are urgently needed if the diseconomies of scale and complexity are to be mitigated. The field needs to reinvent itself. Istanbul, a city that has reinvented itself many times is an excellent venue to engage in such a discussion and for me to offer suggestions and proposals that derive from personal experiences that span academia, start ups, R\&D firms and Bell Labs as well as my early years spent in Istanbul.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Manku:2012:AFC, author = "Gurmeet Singh Manku and Rajeev Motwani", title = "Approximate frequency counts over data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1699--1699", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Research in data stream algorithms has blossomed since late 90s. The talk will trace the history of the Approximate Frequency Counts paper, how it was conceptualized and how it influenced data stream research. The talk will also touch upon a recent development: analysis of personal data streams for improving our quality of lives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hellerstein:2012:MAL, author = "Joseph M. Hellerstein and Christopher R{\'e} and Florian Schoppmann and Daisy Zhe Wang and Eugene Fratkin and Aleksander Gorajek and Kee Siong Ng and Caleb Welton and Xixuan Feng and Kun Li and Arun Kumar", title = "The {MADlib} analytics library: or {MAD} skills, the {SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1700--1711", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MADlib is a free, open-source library of in-database analytic methods. It provides an evolving suite of SQL-based algorithms for machine learning, data mining and statistics that run at scale within a database engine, with no need for data import/export to other tools.
The goal is for MADlib to eventually serve a role for scalable database systems that is similar to the CRAN library for R: a community repository of statistical methods, this time written with scale and parallelism in mind. In this paper we introduce the MADlib project, including the background that led to its beginnings, and the motivation for its open-source nature. We provide an overview of the library's architecture and design patterns, and provide a description of various statistical methods in that context. We include performance and speedup results of a core design pattern from one of those methods over the Greenplum parallel DBMS on a modest-sized test cluster. We then report on two initial efforts at incorporating academic research into MADlib, which is one of the project's goals. MADlib is freely available at http://madlib.net, and the project is open for contributions of both new methods, and ports to additional database platforms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Floratou:2012:CEH, author = "Avrilia Floratou and Nikhil Teletia and David J. DeWitt and Jignesh M. Patel and Donghui Zhang", title = "Can the elephants handle the {NoSQL} onslaught?", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1712--1723", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this new era of ``big data'', traditional DBMSs are under attack from two sides. At one end of the spectrum, the use of document store NoSQL systems (e.g. MongoDB) threatens to move modern Web 2.0 applications away from traditional RDBMSs. 
At the other end of the spectrum, big data DSS analytics that used to be the domain of parallel RDBMSs is now under attack by another class of NoSQL data analytics systems, such as Hive on Hadoop. So, are the traditional RDBMSs, aka ``big elephants'', doomed as they are challenged from both ends of this ``big data'' spectrum? In this paper, we compare one representative NoSQL system from each end of this spectrum with SQL Server, and analyze the performance and scalability aspects of each of these approaches (NoSQL vs. SQL) on two workloads (decision support analysis and interactive data-serving) that represent the two ends of the application spectrum. We present insights from this evaluation and speculate on potential trends for the future.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rabl:2012:SBD, author = "Tilmann Rabl and Sergio G{\'o}mez-Villamor and Mohammad Sadoghi and Victor Munt{\'e}s-Mulero and Hans-Arno Jacobsen and Serge Mankovskii", title = "Solving big data challenges for enterprise application performance management", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1724--1735", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As the complexity of enterprise systems increases, the need for monitoring and analyzing such systems also grows. A number of companies have built sophisticated monitoring tools that go far beyond simple resource utilization reports. For example, based on instrumentation and specialized APIs, it is now possible to monitor single method invocations and trace individual transactions across geographically distributed systems. 
This high-level of detail enables more precise forms of analysis and prediction but comes at the price of high data rates (i.e., big data). To maximize the benefit of data monitoring, the data has to be stored for an extended period of time for ulterior analysis. This new wave of big data analytics imposes new challenges especially for the application performance monitoring systems. The monitoring data has to be stored in a system that can sustain the high data rates and at the same time enable an up-to-date view of the underlying infrastructure. With the advent of modern key--value stores, a variety of data storage systems have emerged that are built with a focus on scalability and high data rates as predominant in this monitoring use case. In this work, we present our experience and a comprehensive performance evaluation of six modern (open-source) data stores in the context of application performance monitoring as part of CA Technologies initiative. We evaluated these systems with data and workloads that can be found in application performance monitoring, as well as, on-line advertisement, power monitoring, and many other use cases. We present our insights not only as performance results but also as lessons learned and our experience relating to the setup and configuration complexity of these data stores in an industry setting.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shinnar:2012:MIP, author = "Avraham Shinnar and David Cunningham and Vijay Saraswat and Benjamin Herta", title = "{M3R}: increased performance for in-memory {Hadoop} jobs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1736--1747", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Main Memory Map Reduce (M3R) is a new implementation of the Hadoop Map Reduce (HMR) API targeted at online analytics on high mean-time-to-failure clusters. It does not support resilience, and supports only those workloads which can fit into cluster memory. In return, it can run HMR jobs unchanged --- including jobs produced by compilers for higher-level languages such as Pig, Jaql, and SystemML and interactive front-ends like IBM BigSheets --- while providing significantly better performance than the Hadoop engine on several workloads (e.g. 45x on some input sizes for sparse matrix vector multiply). M3R also supports extensions to the HMR API which can enable Map Reduce jobs to run faster on the M3R engine, while not affecting their performance under the Hadoop engine.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rosch:2012:SAH, author = "Philipp R{\"o}sch and Lars Dannecker and Franz F{\"a}rber and Gregor Hackenbroich", title = "A storage advisor for hybrid-store databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1748--1758", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the SAP HANA database, SAP offers a high-performance in-memory hybrid-store database. Hybrid-store databases---that is, databases supporting row- and column-oriented data management---are getting more and more prominent. While the columnar management offers high-performance capabilities for analyzing large quantities of data, the row-oriented store can handle transactional point queries as well as inserts and updates more efficiently. To effectively take advantage of both stores at the same time the novel question whether to store the given data row- or column-oriented arises. We tackle this problem with a storage advisor tool that supports database administrators at this decision. Our proposed storage advisor recommends the optimal store based on data and query characteristics; its core is a cost model to estimate and compare query execution times for the different stores. Besides a per-table decision, our tool also considers to horizontally and vertically partition the data and manage the partitions on different stores. We evaluated the storage advisor for the use in the SAP HANA database; we show the recommendation quality as well as the benefit of having the data in the optimal store with respect to increased query performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Switakowski:2012:CSP, author = "Micha{\l} {\'S}witakowski and Peter Boncz and Marcin Zukowski", title = "From cooperative scans to predictive buffer management", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1759--1770", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In analytical applications, database systems often need to sustain workloads with multiple concurrent scans hitting the same table. The Cooperative Scans (CScans) framework, which introduces an Active Buffer Manager (ABM) component into the database architecture, has been the most effective and elaborate response to this problem, and was initially developed in the X100 research prototype. We now report on the experiences of integrating Cooperative Scans into its industrial-strength successor, the Vectorwise database product. During this implementation we invented a simpler optimization of concurrent scan buffer management, called Predictive Buffer Management (PBM). PBM is based on the observation that in a workload with long-running scans, the buffer manager has quite a bit of information on the workload in the immediate future, such that an approximation of the ideal OPT algorithm becomes feasible. In the evaluation on both synthetic benchmarks as well as a TPC-H throughput run we compare the benefits of naive buffer management (LRU) versus CScans, PBM and OPT; showing that PBM achieves benefits close to Cooperative Scans, while incurring much lower architectural impact.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2012:ULI, author = "George Lee and Jimmy Lin and Chuang Liu and Andrew Lorek and Dmitriy Ryaboy", title = "The unified logging infrastructure for data analytics at {Twitter}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1771--1780", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, there has been a substantial amount of work on large-scale data analytics using Hadoop-based platforms running on large clusters of commodity machines. A less-explored topic is how those data, dominated by application logs, are collected and structured to begin with. In this paper, we present Twitter's production logging infrastructure and its evolution from application-specific logging to a unified ``client events'' log format, where messages are captured in common, well-formatted, flexible Thrift messages. Since most analytics tasks consider the user session as the basic unit of analysis, we pre-materialize ``session sequences'', which are compact summaries that can answer a large class of common queries quickly. The development of this infrastructure has streamlined log collection and data analysis, thereby improving our ability to rapidly experiment and iterate on various aspects of the service.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Talius:2012:TLB, author = "Tomas Talius and Robin Dhamankar and Andrei Dumitrache and Hanuma Kodavalla", title = "Transaction log based application error recovery and point in-time query", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1781--1789", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database backups have traditionally been used as the primary mechanism to recover from hardware and user errors. High availability solutions maintain redundant copies of data that can be used to recover from most failures except user or application errors. Database backups are neither space nor time efficient for recovering from user errors which typically occur in the recent past and affect a small portion of the database. Moreover periodic full backups impact user workload and increase storage costs. In this paper we present a scheme that can be used for both user and application error recovery starting from the current state and rewinding the database back in time using the transaction log. While we provide a consistent view of the entire database as of a point in time in the past, the actual prior versions are produced only for data that is accessed. We make the as of data accessible to arbitrary point in time queries by integrating with the database snapshot feature in Microsoft SQL Server.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lamb:2012:VAD, author = "Andrew Lamb and Matt Fuller and Ramakrishna Varadarajan and Nga Tran and Ben Vandiver and Lyric Doshi and Chuck Bear", title = "The {Vertica Analytic Database}: {C-Store} 7 years later", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1790--1801", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper describes the system architecture of the Vertica Analytic Database (Vertica), a commercialization of the design of the C-Store research prototype. Vertica demonstrates a modern commercial RDBMS system that presents a classical relational interface while at the same time achieving the high performance expected from modern ``web scale'' analytic systems by making appropriate architectural choices. Vertica is also an instructive lesson in how academic systems research can be directly commercialized into a successful product.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2012:IAP, author = "Yanpei Chen and Sara Alspaugh and Randy Katz", title = "Interactive analytical processing in big data systems: a cross-industry study of {MapReduce} workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1802--1813", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Within the past few years, organizations in diverse industries have adopted MapReduce-based systems for large-scale data processing. Along with these new users, important new workloads have emerged which feature many small, short, and increasingly interactive jobs in addition to the large, long-running batch jobs for which MapReduce was originally designed. As interactive, large-scale query processing is a strength of the RDBMS community, it is important that lessons from that field be carried over and applied where possible in this new domain. However, these new workloads have not yet been described in the literature. We fill this gap with an empirical analysis of MapReduce traces from six separate business-critical deployments inside Facebook and at Cloudera customers in e-commerce, telecommunications, media, and retail. Our key contribution is a characterization of new MapReduce workloads which are driven in part by interactive analysis, and which make heavy use of query-like programming frameworks on top of MapReduce. These workloads display diverse behaviors which invalidate prior assumptions about MapReduce such as uniform data access, regular diurnal patterns, and prevalence of large jobs. 
A secondary contribution is a first step towards creating a TPC-like data processing benchmark for MapReduce.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lam:2012:MMS, author = "Wang Lam and Lu Liu and S. T. S. Prasad and Anand Rajaraman and Zoheb Vacheri and AnHai Doan", title = "{Muppet}: {MapReduce}-style processing of fast data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1814--1825", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce has emerged as a popular method to process big data. In the past few years, however, not just big data, but fast data has also exploded in volume and availability. Examples of such data include sensor data streams, the Twitter Firehose, and Facebook updates. Numerous applications must process fast data. Can we provide a MapReduce-style framework so that developers can quickly write such applications and execute them over a cluster of machines, to achieve low latency and high scalability? In this paper we report on our investigation of this question, as carried out at Kosmix and WalmartLabs. We describe MapUpdate, a framework like MapReduce, but specifically developed for fast data. We describe Muppet, our implementation of MapUpdate. Throughout the description we highlight the key challenges, argue why MapReduce is not well suited to address them, and briefly describe our current solutions. Finally, we describe our experience and lessons learned with Muppet, which has been used extensively at Kosmix and WalmartLabs to power a broad range of applications in social media and e-commerce.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jacques-Silva:2012:BUD, author = "Gabriela Jacques-Silva and Bugra Gedik and Rohit Wagle and Kun-Lung Wu and Vibhore Kumar", title = "Building user-defined runtime adaptation routines for stream processing applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1826--1837", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processing applications are deployed as continuous queries that run from the time of their submission until their cancellation. This deployment mode limits developers who need their applications to perform runtime adaptation, such as algorithmic adjustments, incremental job deployment, and application-specific failure recovery. Currently, developers do runtime adaptation by using external scripts and/or by inserting operators into the stream processing graph that are unrelated to the data processing logic. In this paper, we describe a component called orchestrator that allows users to write routines for automatically adapting the application to runtime conditions. Developers build an orchestrator by registering and handling events as well as specifying actuations. Events can be generated due to changes in the system state (e.g., application component failures), built-in system metrics (e.g., throughput of a connection), or custom application metrics (e.g., quality score). Once the orchestrator receives an event, users can take adaptation actions by using the orchestrator actuation APIs. 
We demonstrate the use of the orchestrator in IBM's System S in the context of three different applications, illustrating application adaptation to changes on the incoming data distribution, to application failures, and on-demand dynamic composition.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2012:MSP, author = "Junchen Jiang and Hongji Bao and Edward Y. Chang and Yuqian Li", title = "{MOIST}: a scalable and parallel moving object indexer with school tracking", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1838--1849", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Location-Based Service (LBS) is rapidly becoming the next ubiquitous technology for a wide range of mobile applications. To support applications that demand nearest-neighbor and history queries, an LBS spatial indexer must be able to efficiently update, query, archive and mine location records, which can be in contention with each other. In this work, we propose MOIST, whose baseline is a recursive spatial partitioning indexer built upon BigTable. To reduce update and query contention, MOIST groups nearby objects of similar trajectory into the same school, and keeps track of only the history of school leaders. This dynamic clustering scheme can eliminate redundant updates and hence reduce update latency. To improve history query processing, MOIST keeps some history data in memory, while it flushes aged data onto parallel disks in a locality-preserving way. 
Through experimental studies, we show that MOIST can support highly efficient nearest-neighbor and history queries and can scale well with an increasing number of users and update frequency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ports:2012:SSI, author = "Dan R. K. Ports and Kevin Grittner", title = "Serializable snapshot isolation in {PostgreSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1850--1861", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper describes our experience implementing PostgreSQL's new serializable isolation level. It is based on the recently-developed Serializable Snapshot Isolation (SSI) technique. This is the first implementation of SSI in a production database release as well as the first in a database that did not previously have a lock-based serializable isolation level. We reflect on our experience and describe how we overcame some of the resulting challenges, including the implementation of a new lock manager, a technique for ensuring memory usage is bounded, and integration with other PostgreSQL features. We also introduce an extension to SSI that improves performance for read-only transactions. We evaluate PostgreSQL's serializable isolation level using several benchmarks and show that it achieves performance only slightly below that of snapshot isolation, and significantly outperforms the traditional two-phase locking approach on read-intensive workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Murthy:2012:EEU, author = "Karin Murthy and Prasad M. Deshpande and Atreyee Dey and Ramanujam Halasipuram and Mukesh Mohania and P. Deepak and Jennifer Reed and Scott Schumacher", title = "Exploiting evidence from unstructured data to enhance master data management", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1862--1873", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Master data management (MDM) integrates data from multiple structured data sources and builds a consolidated 360-degree view of business entities such as customers and products. Today's MDM systems are not prepared to integrate information from unstructured data sources, such as news reports, emails, call-center transcripts, and chat logs. However, those unstructured data sources may contain valuable information about the same entities known to MDM from the structured data sources. Integrating information from unstructured data into MDM is challenging as textual references to existing MDM entities are often incomplete and imprecise and the additional entity information extracted from text should not impact the trustworthiness of MDM data. In this paper, we present an architecture for making MDM text-aware and showcase its implementation as IBM InfoSphere MDM Extension for Unstructured Text Correlation, an add-on to IBM InfoSphere Master Data Management Standard Edition. We highlight how MDM benefits from additional evidence found in documents when doing entity resolution and relationship discovery. We experimentally demonstrate the feasibility of integrating information from unstructured data sources into MDM.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2012:AOW, author = "Lili Wu and Roshan Sumbaly and Chris Riccomini and Gordon Koo and Hyung Jin Kim and Jay Kreps and Sam Shah", title = "{Avatara}: {OLAP} for web-scale analytics products", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1874--1877", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multidimensional data generated by members on websites has seen massive growth in recent years. OLAP is a well-suited solution for mining and analyzing this data. Providing insights derived from this analysis has become crucial for these websites to give members greater value. For example, LinkedIn, the largest professional social network, provides its professional members rich analytics features like ``Who's Viewed My Profile?'' and ``Who's Viewed This Job?'' The data behind these features form cubes that must be efficiently served at scale, and can be neatly sharded to do so. To serve our growing 160 million member base, we built a scalable and fast OLAP serving system called Avatara to solve this many, small cubes problem. At LinkedIn, Avatara has been powering several analytics features on the site for the past two years.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kolb:2012:DED, author = "Lars Kolb and Andreas Thor and Erhard Rahm", title = "{Dedoop}: efficient deduplication with {Hadoop}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1878--1881", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a powerful and easy-to-use tool called Dedoop (Deduplication with Hadoop) for MapReduce-based entity resolution (ER) of large datasets. Dedoop supports a browser-based specification of complex ER workflows including blocking and matching steps as well as the optional use of machine learning for the automatic generation of match classifiers. Specified workflows are automatically translated into MapReduce jobs for parallel execution on different Hadoop clusters. To achieve high performance Dedoop supports several advanced load balancing strategies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2012:MBD, author = "Xiufeng Liu and Christian Thomsen and Torben Bach Pedersen", title = "{MapReduce}-based dimensional {ETL} made easy", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1882--1885", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper demonstrates ETLMR, a novel dimensional Extract--Transform--Load (ETL) programming framework that uses MapReduce to achieve scalability.
ETLMR has built-in native support of data warehouse (DW) specific constructs such as star schemas, snowflake schemas, and slowly changing dimensions (SCDs). This makes it possible to build MapReduce-based dimensional ETL flows very easily. The ETL process can be configured with only few lines of code. We will demonstrate the concrete steps in using ETLMR to load data into a (partly snowflaked) DW schema. This includes configuration of data sources and targets, dimension processing schemes, fact processing, and deployment. In addition, we also present the scalability on large data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2012:CIE, author = "Huiqi Xu and Zhen Li and Shumin Guo and Keke Chen", title = "{CloudVista}: interactive and economical visual cluster analysis for big data in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1886--1889", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analysis of big data has become an important problem for many business and scientific applications, among which clustering and visualizing clusters in big data raise some unique challenges. This demonstration presents the CloudVista prototype system to address the problems with big data caused by using existing data reduction approaches. It promotes a whole-big-data visualization approach that preserves the details of clustering structure. The prototype system has several merits. (1) Its visualization model is naturally parallel, which guarantees the scalability. (2) The visual frame structure minimizes the data transferred between the cloud and the client. 
(3) The RandGen algorithm is used to achieve a good balance between interactivity and batch processing. (4) This approach is also designed to minimize the financial cost of interactive exploration in the cloud. The demonstration will highlight the problems with existing approaches and show the advantages of the CloudVista approach. The viewers will have the chance to play with the CloudVista prototype system and compare the visualization results generated with different approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alexandrov:2012:MSE, author = "Alexander Alexandrov and Kostas Tzoumas and Volker Markl", title = "{Myriad}: scalable and expressive data generation", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1890--1893", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The current research focus on Big Data systems calls for a rethinking of data generation methods. The traditional sequential data generation approach is not well suited to large-scale systems as generating a terabyte of data may require days or even weeks depending on the number of constraints imposed on the generated model. We demonstrate Myriad, a new data generation toolkit that enables the specification of semantically rich data generator programs that can scale out linearly in a shared-nothing environment. Data generation programs built on top of Myriad implement an efficient parallel execution strategy leveraged by the extensive use of pseudo-random number generators with random access support.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2012:DDC, author = "Eugene Wu and Samuel Madden and Michael Stonebraker", title = "A demonstration of {DBWipes}: clean as you query", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1894--1897", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data analytics becomes mainstream, and the complexity of the underlying data and computation grows, it will be increasingly important to provide tools that help analysts understand the underlying reasons when they encounter errors in the result. While data provenance has been a large step in providing tools to help debug complex workflows, its current form has limited utility when debugging aggregation operators that compute a single output from a large collection of inputs. Traditional provenance will return the entire input collection, which has very low precision. In contrast, users are seeking precise descriptions of the inputs that caused the errors. We propose a Ranked Provenance System, which identifies subsets of inputs that influenced the output error, describes each subset with human readable predicates and orders them by contribution to the error. In this demonstration, we will present DBWipes, a novel data cleaning system that allows users to execute aggregate queries, and interactively detect, understand, and clean errors in the query results. Conference attendees will explore anomalies in campaign donations from the current US presidential election and in readings from a 54-node sensor deployment.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alsubaiee:2012:AOS, author = "Sattam Alsubaiee and Yasser Altowim and Hotham Altwaijry and Alexander Behm and Vinayak Borkar and Yingyi Bu and Michael Carey and Raman Grover and Zachary Heilbron and Young-Seok Kim and Chen Li and Nicola Onose and Pouria Pirzadeh and Rares Vernica and Jian Wen", title = "{ASTERIX}: an open source system for {``Big Data''} management and analysis (demo)", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1898--1901", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "At UC Irvine, we are building a next generation parallel database system, called ASTERIX, as our approach to addressing today's ``Big Data'' management challenges. ASTERIX aims to combine time-tested principles from parallel database systems with those of the Web-scale computing community, such as fault tolerance for long running jobs. In this demo, we present a whirlwind tour of ASTERIX, highlighting a few of its key features. We will demonstrate examples of our data definition language to model semi-structured data, and examples of interesting queries using our declarative query language. In particular, we will show the capabilities of ASTERIX for answering geo-spatial queries and fuzzy queries, as well as ASTERIX' data feed construct for continuously ingesting data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Agarwal:2012:BDI, author = "Sameer Agarwal and Anand P.
Iyer and Aurojit Panda and Samuel Madden and Barzan Mozafari and Ion Stoica", title = "Blink and it's done: interactive queries on very large data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1902--1905", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we present BlinkDB, a massively parallel, sampling-based approximate query processing framework for running interactive queries on large volumes of data. The key observation in BlinkDB is that one can make reasonable decisions in the absence of perfect answers. BlinkDB extends the Hive/HDFS stack and can handle the same set of SPJA (selection, projection, join and aggregate) queries as supported by these systems. BlinkDB provides real-time answers along with statistical error guarantees, and can scale to petabytes of data and thousands of machines in a fault-tolerant manner. Our experiments using the TPC-H benchmark and on an anonymized real-world video content distribution workload from Conviva Inc. show that BlinkDB can execute a wide range of queries up to 150x faster than Hive on MapReduce and 10--150x faster than Shark (Hive on Spark) over tens of terabytes of data stored across 100 machines, all with an error of 2--10\%.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Roy:2012:MGD, author = "Abhishek Roy and Yanlei Diao and Evan Mauceli and Yiping Shen and Bai-Lin Wu", title = "Massive genomic data processing and deep analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1906--1909", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today large sequencing centers are producing genomic data at the rate of 10 terabytes a day and require complicated processing to transform massive amounts of noisy raw data into biological information. To address these needs, we develop a system for end-to-end processing of genomic data, including alignment of short read sequences, variation discovery, and deep analysis. We also employ a range of quality control mechanisms to improve data quality and parallel processing techniques for performance. In the demo, we will use real genomic data to show details of data transformation through the workflow, the usefulness of end results (ready for use as testable hypotheses), the effects of our quality control mechanisms and improved algorithms, and finally performance improvement.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liarou:2012:MDO, author = "Erietta Liarou and Stratos Idreos and Stefan Manegold and Martin Kersten", title = "{MonetDB\slash DataCell}: online analytics in a streaming column-store", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1910--1913", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In DataCell, we design streaming functionalities in a modern relational database kernel which targets big data analytics. This includes exploitation of both its storage/execution engine and its optimizer infrastructure. We investigate the opportunities and challenges that arise with such a direction and we show that it carries significant advantages for modern applications in need for online analytics such as web logs, network monitoring and scientific data management. The major challenge then becomes the efficient support for specialized stream features, e.g., multi-query processing and incremental window-based processing as well as exploiting standard DBMS functionalities in a streaming environment such as indexing. This demo presents DataCell, an extension of the MonetDB open-source column-store for online analytics. The demo gives users the opportunity to experience the features of DataCell such as processing both stream and persistent data and performing window based processing. The demo provides a visual interface to monitor the critical system components, e.g., how query plans transform from typical DBMS query plans to online query plans, how data flows through the query plans as the streams evolve, how DataCell maintains intermediate results in columnar form to avoid repeated evaluation of the same stream portions, etc. 
The demo also provides the ability to interactively set the test scenarios and various DataCell knobs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2012:SSE, author = "Xin Cao and Gao Cong and Christian S. Jensen and Jun Jie Ng and Beng Chin Ooi and Nhan-Tue Phan and Dingming Wu", title = "{SWORS}: a system for the efficient retrieval of relevant spatial web objects", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1914--1917", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spatial web objects that possess both a geographical location and a textual description are gaining in prevalence. This gives prominence to spatial keyword queries that exploit both location and textual arguments. Such queries are used in many web services such as yellow pages and maps services. We present SWORS, the Spatial Web Object Retrieval System, that is capable of efficiently retrieving spatial web objects that satisfy spatial keyword queries. Specifically, SWORS supports two types of queries: (a) the location-aware top-$k$ text retrieval (L$k$T) query that retrieves $k$ individual spatial web objects taking into account query location proximity and text relevancy; (b) the spatial keyword group (SKG) query that retrieves a group of objects that cover the query keywords and are nearest to the query location and have the shortest inter-object distances. SWORS provides browser-based interfaces for desktop and laptop computers and provides a client application for mobile devices. The interfaces and the client enable users to formulate queries and view the query results on a map. The server side stores the data and processes the queries.
We use three real-life data sets to demonstrate the functionality and performance of SWORS.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Morishima:2012:CCD, author = "Atsuyuki Morishima and Norihide Shinagawa and Tomomi Mitsuishi and Hideto Aoki and Shun Fukusumi", title = "{CyLog\slash Crowd4U}: a declarative platform for complex data-centric crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1918--1921", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demo presents a principled approach to the problems of data-centric human/machine computations with Crowd4U, a crowdsourcing platform equipped with a suite of tools for rapid development of crowdsourcing applications. Using the demo, we show that declarative database abstraction can be used as a powerful tool to design, implement, and analyze data-centric crowdsourcing applications. The power of Crowd4U comes from CyLog, a database abstraction that handles complex data-centric human/machine computations. CyLog is a Datalog-like language that incorporates a principled feedback system for humans at the language level so that the semantics of the computation not closed in machines can be defined based on the game theory. We believe that the demo clearly shows that database abstraction can be a promising basis for designing complex data-centric applications requiring human/machine computations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Silva:2012:EDS, author = "Yasin N. 
Silva and Spencer Pearson", title = "Exploiting database similarity joins for metric spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1922--1925", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity Joins are recognized among the most useful data processing and analysis operations and are extensively used in multiple application domains. They retrieve all data pairs whose distances are smaller than a predefined threshold $ \epsilon $. Multiple Similarity Join algorithms and implementation techniques have been proposed. They range from out-of-database approaches for only in-memory and external memory data to techniques that make use of standard database operators to answer similarity joins. Recent work has shown that this operation can be efficiently implemented as a physical database operator. However, the proposed operator only supports 1D numeric data. This paper presents DBSimJoin, a physical Similarity Join database operator for datasets that lie in any metric space. DBSimJoin is a non-blocking operator that prioritizes the early generation of results. We implemented the proposed operator in PostgreSQL, an open source database system. We show how this operator can be used in multiple real-world data analysis scenarios with multiple data types and distance functions. Particularly, we show the use of DBSimJoin to identify similar images represented as feature vectors, and similar publications in a bibliographic database. We also show that DBSimJoin scales very well when important parameters, e.g., $ \epsilon $, data size, increase.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gawade:2012:SPI, author = "Mrunal Gawade and Martin Kersten", title = "{Stethoscope}: a platform for interactive visual analysis of query execution plans", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1926--1929", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Searching for the performance bottleneck in an execution trace is an error prone and time consuming activity. Existing tools offer some comfort by providing a visual representation of trace for analysis. In this paper we present the Stethoscope, an interactive visual tool to inspect and analyze columnar database query performance, both online and offline. Its unique interactive animated interface capitalizes on the large data-flow graph representation of a query execution plan, augmented with query execution trace information. We demonstrate features of Stethoscope for both online and offline analysis of long running queries. It helps in understanding where time goes, how optimizers perform, and how parallel processing on multi-core systems is exploited.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kotsifakos:2012:HSS, author = "Alexios Kotsifakos and Panagiotis Papapetrou and Jaakko Hollm{\'e}n and Dimitrios Gunopulos and Vassilis Athitsos and George Kollios", title = "{Hum-a-song}: a subsequence matching with gaps-range-tolerances query-by-humming system", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1930--1933", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present ``Hum-a-song'', a system built for music retrieval, and particularly for the Query-By-Humming (QBH) application. According to QBH, the user is able to hum a part of a song that she recalls and would like to learn what this song is, or find other songs similar to it in a large music repository. We present a simple yet efficient approach that maps the problem to time series subsequence matching. The query and the database songs are represented as 2-dimensional time series conveying information about the pitch and the duration of the notes. Then, since the query is a short sequence and we want to find its best match that may start and end anywhere in the database, subsequence matching methods are suitable for this task. In this demo, we present a system that employs and exposes to the user a variety of state-of-the-art dynamic programming methods, including a newly proposed efficient method named SMBGT that is robust to noise and considers all intrinsic problems in QBH; it allows variable tolerance levels when matching elements, where tolerances are defined as functions of the compared sequences, gaps in both the query and target sequences, and bounds the matching length and (optionally) the minimum number of matched elements. 
Our system is intended to become open source, which is to the best of our knowledge the first non-commercial effort trying to solve QBH with a variety of methods, and that also approaches the problem from the time series perspective.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kwon:2012:SAM, author = "YongChul Kwon and Magdalena Balazinska and Bill Howe and Jerome Rolia", title = "{SkewTune} in action: mitigating skew in {MapReduce} applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1934--1937", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate SkewTune, a system that automatically mitigates skew in user-defined MapReduce programs and is a drop-in replacement for Hadoop. The demonstration has two parts. First, we demonstrate how SkewTune mitigates skew in real MapReduce applications at runtime by running a real application in a public cloud. Second, through an interactive graphical interface, we demonstrate the details of the skew mitigation process using both real and synthetic workloads that represent various skew configurations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abouzied:2012:PQS, author = "Azza Abouzied and Joseph M. 
Hellerstein and Avi Silberschatz", title = "Playful query specification with {DataPlay}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1938--1941", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "DataPlay is a query tool that encourages a trial-and-error approach to query specification. DataPlay uses a graphical query language to make a particularly challenging query specification task --- quantification --- easier. It constrains the relational data model to enable the presentation of non-answers, in addition to answers, to aid query interpretation. Two novel features of DataPlay are suggesting semantic variations to a query and correcting queries by example. We introduce DataPlay as a sophisticated query specification tool and demonstrate its unique interaction models.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alagiannis:2012:NAA, author = "Ioannis Alagiannis and Renata Borovica and Miguel Branco and Stratos Idreos and Anastasia Ailamaki", title = "{NoDB} in action: adaptive query processing on raw data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1942--1945", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data collections become larger and larger, users are faced with increasing bottlenecks in their data analysis. More data means more time to prepare the data, to load the data into the database and to execute the desired queries. 
Many applications already avoid using traditional database systems, e.g., scientific data analysis and social networks, due to their complexity and the increased data-to-query time, i.e. the time between getting the data and retrieving its first useful results. For many applications data collections keep growing fast, even on a daily basis, and this data deluge will only increase in the future, where it is expected to have much more data than what we can move or store, let alone analyze. In this demonstration, we will showcase a new philosophy for designing database systems called NoDB. NoDB aims at minimizing the data-to-query time, most prominently by removing the need to load data before launching queries. We will present our prototype implementation, PostgresRaw, built on top of PostgreSQL, which allows for efficient query execution over raw data files with zero initialization overhead. We will visually demonstrate how PostgresRaw incrementally and adaptively touches, parses, caches and indexes raw data files autonomously and exclusively as a side-effect of user queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wenzel:2012:CPQ, author = "Florian Wenzel and Markus Endres and Stefan Mandl and Werner Kie{\ss}ling", title = "Complex preference queries supporting spatial applications for user groups", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1946--1949", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Our demo application demonstrates a personalized location-based web application using Preference SQL that allows single users as well as groups of users to find accommodations in Istanbul that satisfy both hard constraints and user preferences. The application assists in defining spatial, numerical, and categorical base preferences and composes complex preference statements in an intuitive fashion. Unlike existing location-based services, the application considers spatial queries as soft instead of hard constraints to determine the best matches which are finally presented on a map. The underlying Preference SQL framework is implemented on top of a database, therefore enabling a seamless application integration with standard SQL back-end systems as well as efficient and extensible preference query processing.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bakibayev:2012:DFQ, author = "Nurzhan Bakibayev and Dan Olteanu and Jakub Z{\'a}vodn{\'y}", title = "Demonstration of the {FDB} query engine for factorised databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1950--1953", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "FDB is an in-memory query engine for factorised databases, which are relational databases that use compact factorised representations at the physical layer to reduce data redundancy and boost query performance. We demonstrate FDB using real data sets from IMDB, DBLP, and the NELL repository of facts learned from Web pages. The users can inspect factorisations as well as plans used by FDB to compute factorised results of select-project-join queries on factorised databases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2012:PRD, author = "Zichen Xu and Yi-Cheng Tu and Xiaorui Wang", title = "{PET}: reducing database energy cost via query optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1954--1957", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Energy conservation is a growing important issue in designing modern database management system (DBMS). This requires a deep thinking about the tradeoffs between energy and performance. 
Despite the significant amount of efforts at the hardware level to make the major components consume less energy, we argue for a revisit of the DBMS query processing mechanism to identify and harvest the potential of energy saving. However, the state-of-art architecture of DBMS does not take energy usage into consideration in its design. A major challenge in developing an energy-aware DBMS is to design and implement a cost-based query optimizer that evaluates query plans by both performance and energy costs. By following such a strategy, our previous work revealed the fact that energy-efficient query plans do not necessarily have the shortest processing time. This demo proposal introduces PET --- an energy-aware query optimization framework that is built as a part of the PostgreSQL kernel. PET, via its power cost estimation module and plan evaluation model, enables the database system to run under a DBA-specified energy/performance tradeoff level. PET contains a power cost estimator that can accurately estimate the power cost of query plans at compile time, and a query evaluation engine that the DBA could configure key PET parameters towards the desired tradeoff. The software to be demonstrated will also include workload engine for producing large quantities of queries and data sets. Our demonstration will show how PET functions via a comprehensive set of views from its graphical user interface named PET Viewer. Through such interfaces, a user can achieve a good understanding of the energy-related query optimization and cost-based plan generation. Users are also allowed to interact with PET to experience the different energy/performance tradeoffs by changing PET and workload parameters at query runtime.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Letelier:2012:SSA, author = "Andr{\'e}s Letelier and Jorge P{\'e}rez and Reinhard Pichler and Sebastian Skritek", title = "{SPAM}: a {SPARQL} analysis and manipulation tool", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1958--1961", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SQL developers are used to having elaborate tools which help them in writing queries. In contrast, the creation of tools to assist users in the development of SPARQL queries is still in its infancy. In this system demo, we present the SPARQL Analysis and Manipulation (SPAM) tool, which provides help for the development of SPARQL queries. The main features of the SPAM tool comprise an editor with both text and graphical interface, as well as various functions for the static and dynamic analysis of SPARQL queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koutris:2012:QDP, author = "Paraschos Koutris and Prasang Upadhyaya and Magdalena Balazinska and Bill Howe and Dan Suciu", title = "{QueryMarket} demonstration: pricing for online data markets", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1962--1965", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Increasingly data is being bought and sold online. 
To facilitate such transactions, online data marketplaces have emerged to provide a service for sellers to price views on their data, and buyers to buy such views. These marketplaces neither support the sale of ad-hoc queries (that are not one of the specified views), nor do they support queries that join datasets. We present QueryMarket, a prototype data marketplace that automatically extrapolates prices to ad-hoc queries, including those with joins, from the manually priced views. We call this capability ``query-based pricing'' and describe how it is superior to existing pricing methods, and how it provides more flexible pricing for the sellers. We then show how QueryMarket implements query-based pricing and how it generates explanations for the prices it computes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luo:2012:DSD, author = "Siqiang Luo and Yifeng Luo and Shuigeng Zhou and Gao Cong and Jihong Guan", title = "{DISKs}: a system for distributed spatial group keyword search on road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1966--1969", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query (e.g., shortest path) on road networks has been extensively studied. Although most of the existing query processing approaches are designed for centralized environments, there is a growing need to handle queries on road networks in distributed environments due to the increasing query workload and the challenge of querying large networks. 
In this demonstration, we showcase a distributed system called {DISKs} (DIstributed Spatial Keyword search) that is capable of efficiently supporting spatial group keyword search (S-GKS) on road networks. Given a group of keywords $X$ and a distance $r$, an S-GKS returns locations on a road network, such that for each returned location $p$, there exists a set of nodes (on the road network), which are located within a network distance $r$ from $p$ and collectively contain $X$. We will demonstrate the innovative modules, performance and interactive user interfaces of DISKs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Endrullis:2012:WEM, author = "Stefan Endrullis and Andreas Thor and Erhard Rahm", title = "{WETSUIT}: an efficient mashup tool for searching and fusing web entities", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1970--1973", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a new powerful mashup tool called WETSUIT (Web EnTity Search and fUsIon Tool) to search and integrate web data from diverse sources and domain-specific entity search engines. WETSUIT supports adaptive search strategies to query sets of relevant entities with a minimum of communication overhead. Mashups can be composed using a set of high-level operators based on the Java-compatible language Scala. The operator implementation supports a high degree of parallel processing, in particular a streaming of entities between all data transformation operations facilitating a fast presentation of intermediate results. 
WETSUIT has already been applied to solve challenging integration tasks from different domains.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khalefa:2012:MBI, author = "Mohamed E. Khalefa and Ulrike Fischer and Torben Bach Pedersen and Wolfgang Lehner", title = "Model-based integration of past \& future in {TimeTravel}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1974--1977", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate TimeTravel, an efficient DBMS system for seamless integrated querying of past and (forecasted) future values of time series, allowing the user to view past and future values as one joint time series. This functionality is important for advanced application domains like energy. The main idea is to compactly represent time series as models. By using models, the TimeTravel system answers queries approximately on past and future data with error guarantees (absolute error and confidence) one order of magnitude faster than when accessing the time series directly. In addition, it efficiently supports exact historical queries by only accessing relevant portions of the time series. This is unlike existing approaches, which access the entire time series to exactly answer the query. To realize this system, we propose a novel hierarchical model index structure. As real-world time series usually exhibit seasonal behavior, models in this index incorporate seasonality. To construct a hierarchical model index, the user specifies seasonality period, error guarantees levels, and a statistical forecast method. 
As time proceeds, the system incrementally updates the index and utilizes it to answer approximate and exact queries. TimeTravel is implemented into PostgreSQL, thus achieving complete user transparency at the query level. In the demo, we show the easy building of a hierarchical model index for a real-world time series and the effect of varying the error guarantees on the speed up of approximate and exact queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eberius:2012:DEB, author = "Julian Eberius and Maik Thiele and Katrin Braunschweig and Wolfgang Lehner", title = "{DrillBeyond}: enabling business analysts to explore the {Web of Open Data}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1978--1981", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Following the Open Data trend, governments and public agencies have started making their data available on the Web and established platforms such as data.gov or data.un.org. These Open Data platforms provide a huge amount of data for various topics such as demographics, transport, finance or health in various data formats. One typical usage scenario for this kind of data is their integration into a database or data warehouse in order to apply data analytics. However, in today's business intelligence tools there is an evident lack of support for so-called situational or ad-hoc data integration. In this demonstration we will therefore present DrillBeyond, a novel database and information retrieval engine which allows users to query a local database as well as the Web of Open Data in a seamless and integrated way with standard SQL. 
The audience will be able to pose queries to our DrillBeyond system which will be answered partly from local data in the database and partly from datasets that originate from the Web of Data. We will show how such queries are divided into known and unknown parts and how missing attributes are mapped to open datasets. We will demonstrate the integration of the open datasets back into the DBMS in order to apply its analytical features.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nakashole:2012:DER, author = "Ndapandula Nakashole and Gerhard Weikum and Fabian Suchanek", title = "Discovering and exploring relations on the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1982--1985", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a demonstration of PATTY, a system for learning semantic relationships from the Web. PATTY is a collection of relations learned automatically from text. It aims to be to patterns what WordNet is to words. The semantic types of PATTY relations enable advanced search over subject-predicate-object data. With the ongoing trends of enriching Web data (both text and tables) with entity-relationship-oriented semantic annotations, we believe a demo of the PATTY system will be of interest to the database community.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Thirumuruganathan:2012:MME, author = "Saravanan Thirumuruganathan and Mahashweta Das and Shrikant Desai and Sihem Amer-Yahia and Gautam Das and Cong Yu", title = "{MapRat}: meaningful explanation, interactive exploration and geo-visualization of collaborative ratings", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1986--1989", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Collaborative rating sites such as IMDB and Yelp have become rich resources that users consult to form judgments about and choose from among competing items. Most of these sites either provide a plethora of information for users to interpret all by themselves or simple overall aggregate information. Such aggregates (e.g., average rating over all users who have rated an item, aggregates along pre-defined dimensions, etc.) cannot help a user quickly decide the desirability of an item. In this paper, we build a system MapRat that allows a user to explore multiple carefully chosen aggregate analytic details over a set of user demographics that meaningfully explain the ratings associated with item(s) of interest. MapRat allows a user to systematically explore, visualize and understand user rating patterns of input item(s) so as to make an informed decision quickly. In the demo, participants are invited to explore collaborative movie ratings for popular movies.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2012:DSD, author = "Hyunjung Park and Hector Garcia-Molina and Richard Pang and Neoklis Polyzotis and Aditya Parameswaran and Jennifer Widom", title = "{Deco}: a system for declarative crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1990--1993", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deco is a system that enables declarative crowdsourcing: answering SQL queries posed over data gathered from the crowd as well as existing relational data. Deco implements a novel push-pull hybrid execution model in order to support a flexible data model and a precise query semantics, while coping with the combination of latency, monetary cost, and uncertainty of crowdsourcing. We demonstrate Deco using two crowdsourcing platforms: Amazon Mechanical Turk and an in-house platform, to show how Deco provides a convenient means of collecting and querying crowdsourced data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Martens:2012:DAX, author = "Wim Martens and Matthias Niewerth and Frank Neven and Thomas Schwentick", title = "Developing and analyzing {XSDs} through {BonXai}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1994--1997", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "BonXai is a versatile schema specification language expressively equivalent to XML Schema. 
It is not intended as a replacement for XML Schema but it can serve as an additional, user-friendly front-end. It offers a simple way and a lightweight syntax to specify the context of elements based on regular expressions rather than on types. In this demo we show the front-end capabilities of BonXai and exemplify its potential to offer a novel way to view existing XML Schema Definitions. In particular, we present several usage scenarios specifically targeted to showcase the ease of specifying, modifying, and understanding XML Schema Definitions through BonXai.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elmore:2012:IEG, author = "Aaron J. Elmore and Sudipto Das and Divyakant Agrawal and Amr {El Abbadi}", title = "{InfoPuzzle}: exploring group decision making in mobile peer-to-peer databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "1998--2001", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As Internet-based services and mobile computing devices, such as smartphones and tablets, become ubiquitous, society's reliance on them to accomplish critical and time-sensitive tasks, such as information dissemination and collaborative decision making, also increases. Dependence on these media magnifies the damage caused by their disruption, whether malicious or natural. For instance, a natural disaster disrupting cellular and Internet infrastructures impedes information spread, which in turn leads to chaos, both among the victims as well as the aid providers. Decentralized and ad-hoc mechanisms for information dissemination and decision making are paramount to help restore order. 
We demonstrate InfoPuzzle, a mobile peer-to-peer database that utilizes direct device communication to enable group decision making, or consensus, without reliance on centralized communication services. InfoPuzzle minimizes the system's resource consumption, to prolong the lifetime of the power constrained devices by minimizing communication overhead, computational complexity, and persistent storage size. Due to user mobility and the limited range of point-to-point communication, knowing the exact number of participants is impossible, and therefore traditional consensus or quorum protocols cannot be used. We rely on distinct counting techniques, probabilistic thresholds, and bounded time based approaches to reach agreement. In this demo, we will explore various challenges and heuristics in estimating group participation to aid users in reconciling consensus without centralized services.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2012:MQG, author = "Jianqiu Xu and Ralf Hartmut G{\"u}ting", title = "Manage and query generic moving objects in {SECONDO}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2002--2005", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we introduce a system that is able to manage moving objects in all real world environments, e.g., road network, bus network and indoor. The complete trip of a person is managed by the system such as Walk, Car, Walk, and Indoor, where the precise locations of both outdoor and indoor movements are represented. Trajectories located in several environments are integrated into the same framework. 
The system supports the shortest path searching for start and end locations being in different environments, for example, from a room to a bus stop. A comprehensive and scalable set of moving objects is generated to simulate human movement in practice. Optimization methods are developed to efficiently answer novel queries regarding transportation modes and mobile environments. Most of these queries are not supported by existing methods because of the limitation of data representation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2012:CFH, author = "Pei Li and Christina Tziviskou and Haidong Wang and Xin Luna Dong and Xiaoguang Liu and Andrea Maurino and Divesh Srivastava", title = "{Chronos}: facilitating history discovery by linking temporal records", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2006--2009", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many data sets contain temporal records over a long period of time; each record is associated with a time stamp and describes some aspects of a real-world entity at that particular time. From such data, users often wish to search for entities in a particular period and understand the history of one entity or all entities in the data set. A major challenge for enabling such search and exploration is to identify records that describe the same real-world entity over a long period of time; however, linking temporal records is hard given that the values that describe an entity can evolve over time (e.g., a person can move from one affiliation to another). 
We demonstrate the Chronos system which offers users the useful tool for finding real-world entities over time and understanding history of entities in the bibliography domain. The core of Chronos is a temporal record-linkage algorithm, which is tolerant to value evolution over time. Our algorithm can obtain an F-measure of over 0.9 in linking author records and fix errors made by DBLP. We show how Chronos allows users to explore the history of authors, and how it helps users understand our linkage results by comparing our results with those of existing systems, highlighting differences in the results, explaining our decisions to users, and answering ``what-if'' questions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koubarakis:2012:TDP, author = "Manolis Koubarakis and Mihai Datcu and Charalambos Kontoes and Ugo {Di Giammatteo} and Stefan Manegold and Eva Klien", title = "{TELEIOS}: a database-powered virtual earth observatory", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2010--2013", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "TELEIOS is a recent European project that addresses the need for scalable access to petabytes of Earth Observation data and the discovery and exploitation of knowledge that is hidden in them. TELEIOS builds on scientific database technologies (array databases, SciQL, data vaults) and Semantic Web technologies (stRDF and stSPARQL) implemented on top of a state of the art column store database system (MonetDB). 
We demonstrate a first prototype of the TELEIOS Virtual Earth Observatory (VEO) architecture, using a forest fire monitoring application as example.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dittrich:2012:EBD, author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz", title = "Efficient big data processing in {Hadoop MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2014--2015", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This tutorial is motivated by the clear need of many organizations, companies, and researchers to deal with big data volumes efficiently. Examples include web analytics applications, scientific applications, and social networks. A popular data processing engine for big data is Hadoop MapReduce. Early versions of Hadoop MapReduce suffered from severe performance problems. Today, this is becoming history. There are many techniques that can be used with Hadoop MapReduce jobs to boost performance by orders of magnitude. In this tutorial we teach such techniques. First, we will briefly familiarize the audience with Hadoop MapReduce and motivate its use for big data processing. Then, we will focus on different data management techniques, going from job optimization to physical data organization like data layouts and indexes. Throughout this tutorial, we will highlight the similarities and differences between Hadoop MapReduce and Parallel DBMS. Furthermore, we will point out unresolved research problems and open issues.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shim:2012:MAB, author = "Kyuseok Shim", title = "{MapReduce} algorithms for big data analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2016--2017", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is a growing trend of applications that should handle big data. However, analyzing big data is a very challenging problem today. For such applications, the MapReduce framework has recently attracted a lot of attention. Google's MapReduce or its open-source equivalent Hadoop is a powerful tool for building such applications. In this tutorial, we will introduce the MapReduce framework based on Hadoop, discuss how to design efficient MapReduce algorithms and present the state-of-the-art in MapReduce algorithms for data mining, machine learning and similarity joins. The intended audience of this tutorial is professionals who plan to design and develop MapReduce algorithms and researchers who should be aware of the state-of-the-art in MapReduce algorithms available today for big data analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Getoor:2012:ERT, author = "Lise Getoor and Ashwin Machanavajjhala", title = "Entity resolution: theory, practice \& open challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2018--2019", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This tutorial brings together perspectives on ER from a variety of fields, including databases, machine learning, natural language processing and information retrieval, to provide, in one setting, a survey of a large body of work. We discuss both the practical aspects and theoretical underpinnings of ER. We describe existing solutions, current challenges, and open research problems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schindler:2012:CND, author = "Jiri Schindler", title = "{I/O} characteristics of {NoSQL} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2020--2021", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The advent of the so-called NoSQL databases has brought about a new model of using storage systems. 
While traditional relational database systems took advantage of features offered by centrally-managed, enterprise-class storage arrays, the new generation of database systems with weaker data consistency models is content with using and managing locally attached individual storage devices and providing data reliability and availability through high-level software features and protocols. This work aims to review the architecture of several existing NoSQL DBs with an emphasis on how they organize and access data in the shared-nothing locally-attached storage model. It shows how these systems operate under typical workloads (new inserts and point and range queries), what access characteristics they exhibit to storage systems. Finally, it examines how several recently developed key/value stores, schema-free document storage systems, and extensible column stores organize data on local filesystems on top of directly-attached disks and what system features they must (re)implement in order to provide the expected data reliability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2012:MKI, author = "Yizhou Sun and Jiawei Han and Xifeng Yan and Philip S. Yu", title = "Mining knowledge from interconnected data: a heterogeneous information network analysis approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2022--2023", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most objects and data in the real world are interconnected, forming complex, heterogeneous but often semi-structured information networks. 
However, most people consider a database merely as a data repository that supports data storage and retrieval rather than one or a set of heterogeneous information networks that contain rich, inter-related, multi-typed data and information. Most network science researchers only study homogeneous networks, without distinguishing the different types of objects and links in the networks. In this tutorial, we view database and other interconnected data as heterogeneous information networks, and study how to leverage the rich semantic meaning of types of objects and links in the networks. We systematically introduce the technologies that can effectively and efficiently mine useful knowledge from such information networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Prakash:2012:UMC, author = "B. Aditya Prakash and Christos Faloutsos", title = "Understanding and managing cascades on large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2024--2025", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "How do contagions spread in population networks? Which group should we market to, for maximizing product penetration? Will a given YouTube video go viral? Who are the best people to vaccinate? What happens when two products compete? The objective of this tutorial is to provide an intuitive and concise overview of most important theoretical results and algorithms to help us understand and manipulate such propagation-style processes on large networks. 
The tutorial contains three parts: (a) Theoretical results on the behavior of fundamental models; (b) Scalable Algorithms for changing the behavior of these processes e.g., for immunization, marketing etc.; and (c) Empirical Studies of diffusion on blogs and on-line websites like Twitter. The problems we focus on are central in surprisingly diverse areas: from computer science and engineering, epidemiology and public health, product marketing to information dissemination. Our emphasis is on intuition behind each topic, and guidelines for the practitioner.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dogac:2012:IES, author = "Asuman Dogac", title = "Interoperability in {eHealth} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2026--2027", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Interoperability in eHealth systems is important for delivering quality healthcare and reducing healthcare costs. Some of the important use cases include coordinating the care of chronic patients by enabling the co-operation of many different eHealth systems such as Electronic Health Record Systems (EHRs), Personal Health Record Systems (PHRs) and wireless medical sensor devices; enabling secondary use of EHRs for clinical research; being able to share life long EHRs among different healthcare providers. 
Although achieving eHealth interoperability is quite a challenge both because there are competing standards and clinical information itself is very complex, there have been a number of successful industry initiatives such as Integrating the Healthcare Enterprise (IHE) Profiles, as well as large scale deployments such as the National Health Information System of Turkey and the epSOS initiative for sharing Electronic Health Records and ePrescriptions in Europe. This article briefly describes the subjects discussed in the VLDB 2012 tutorial to provide an overview of the issues in eHealth interoperability describing the key technologies and standards, identifying important use cases and the associated research challenges and also describing some of the large scale deployments. The aim is to foster further interest in this area.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Agrawal:2012:SPP, author = "Divyakant Agrawal and Amr {El Abbadi} and Shiyuan Wang", title = "Secure and privacy-preserving data services in the cloud: a data centric view", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2028--2029", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud computing becomes a successful paradigm for data computing and storage. Increasing concerns about data security and privacy in the cloud, however, have emerged. Ensuring security and privacy for data management and query processing in the cloud is critical for better and broader uses of the cloud. 
This tutorial covers some common cloud security and privacy threats and the relevant research, while focusing on the works that protect data confidentiality and query access privacy for sensitive data being stored and queried in the cloud. We provide a comprehensive study of state-of-the-art schemes and techniques for protecting data confidentiality and access privacy, which make different tradeoffs in the multidimensional space of security, privacy, functionality and performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guha:2012:GSS, author = "Sudipto Guha and Andrew McGregor", title = "Graph synopses, sketches, and streams: a survey", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2030--2031", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive graphs arise in any application where there is data about both basic entities and the relationships between these entities, e.g., web-pages and hyperlinks; neurons and synapses; papers and citations; IP addresses and network flows; people and their friendships. Graphs have also become the de facto standard for representing many types of highly structured data. However, the sheer size of many of these graphs renders classical algorithms inapplicable when it comes to analyzing such graphs. In addition, these existing algorithms are typically ill-suited to processing distributed or stream data. Various platforms have been developed for processing large data sets. At the same time, there is the need to develop new algorithmic ideas and paradigms. In the case of graph processing, a lot of recent work has focused on understanding the important algorithmic issues. 
A central aspect of this is the question of how to construct and leverage small-space synopses in graph processing. The goal of this tutorial is to survey recent work on this question and highlight interesting directions for future research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Labrinidis:2012:COB, author = "Alexandros Labrinidis and H. V. Jagadish", title = "Challenges and opportunities with big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2032--2033", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The promise of data-driven decision-making is now being recognized broadly, and there is growing enthusiasm for the notion of ``Big Data,'' including the recent announcement from the White House about new funding initiatives across different agencies, that target research for Big Data. While the promise of Big Data is real --- for example, it is estimated that Google alone contributed 54 billion dollars to the US economy in 2009 --- there is no clear consensus on what is Big Data. In fact, there have been many controversial statements about Big Data, such as ``Size is the only thing that matters.'' In this panel we will try to explore the controversies and debunk the myths surrounding Big Data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{ElAbbadi:2012:PDS, author = "Amr {El Abbadi} and Mohamed F.
Mokbel", title = "Panel discussion on social networks and mobility in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "5", number = "12", pages = "2034--2035", month = aug, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 6 16:43:21 MST 2012", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social networks, mobility and the cloud represent special and unique opportunities for synergy among several existing and emerging communities that are now often evolving in isolated silos. All three areas hold much promise for the future of computing, and represent significant challenges for large scale data management. As these three areas evolve, their direct influence on significant decisions on each other becomes evident and critical. This panel will bring together a set of renowned researchers who will explore and discuss the synergy and tensions among critical and often intertwined research and application issues that arise in the context of social networks and mobility in a cloud infrastructure setting.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bouros:2012:STS, author = "Panagiotis Bouros and Shen Ge and Nikos Mamoulis", title = "Spatio-textual similarity joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "1", pages = "1--12", month = nov, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 22 12:18:56 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a collection of objects that carry both spatial and textual information, a spatio-textual similarity join retrieves the pairs of objects that are spatially close and textually similar. 
As an example, consider a social network with spatially and textually tagged persons (i.e., their locations and profiles). A useful task (for friendship recommendation) would be to find pairs of persons that are spatially close and their profiles have a large overlap (i.e., they have common interests). Another application is data de-duplication (e.g., finding photographs which are spatially close to each other and high overlap in their descriptive tags). Despite the importance of this operation, there is very little previous work that studies its efficient evaluation and in fact under a different definition; only the best match for each object is identified. In this paper, we combine ideas from state-of-the-art spatial distance join and set similarity join methods and propose efficient algorithms that take into account both spatial and textual constraints. Besides, we propose a batch processing technique which boosts the performance of our approaches. An experimental evaluation using real and synthetic datasets shows that our optimized techniques are orders of magnitude faster than base-line solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Drosou:2012:DDR, author = "Marina Drosou and Evaggelia Pitoura", title = "{DisC} diversity: result diversification based on dissimilarity and coverage", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "1", pages = "13--24", month = nov, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 22 12:18:56 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, result diversification has attracted a lot of attention as a means to improve the quality of results retrieved by user queries. In this paper, we propose a new, intuitive definition of diversity called DisC diversity. 
A DisC diverse subset of a query result contains objects such that each object in the result is represented by a similar object in the diverse subset and the objects in the diverse subset are dissimilar to each other. We show that locating a minimum DisC diverse subset is an NP-hard problem and provide heuristics for its approximation. We also propose adapting DisC diverse subsets to a different degree of diversification. We call this operation zooming. We present efficient implementations of our algorithms based on the M-tree, a spatial index structure, and experimentally evaluate their performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2012:DPF, author = "Chen Zeng and Jeffrey F. Naughton and Jin-Yi Cai", title = "On differentially private frequent itemset mining", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "1", pages = "25--36", month = nov, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 22 12:18:56 MDT 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider differentially private frequent itemset mining. We begin by exploring the theoretical difficulty of simultaneously providing good utility and good privacy in this task. While our analysis proves that in general this is very difficult, it leaves a glimmer of hope in that our proof of difficulty relies on the existence of long transactions (that is, transactions containing many items). Accordingly, we investigate an approach that begins by truncating long transactions, trading off errors introduced by the truncation with those introduced by the noise added to guarantee privacy. Experimental results over standard benchmark databases show that truncating is indeed effective. 
Our algorithm solves the ``classical'' frequent itemset mining problem, in which the goal is to find all itemsets whose support exceeds a threshold. Related work has proposed differentially private algorithms for the top-$k$ itemset mining problem (``find the $k$ most frequent itemsets''). An experimental comparison with those algorithms shows that our algorithm achieves better $F$-score unless $k$ is small.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2012:LMS, author = "Xin Luna Dong and Barna Saha and Divesh Srivastava", title = "Less is more: selecting sources wisely for integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "37--48", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We are often thrilled by the abundance of information surrounding us and wish to integrate data from as many sources as possible. However, understanding, analyzing, and using these data are often hard. Too much data can introduce a huge integration cost, such as expenses for purchasing data and resources for integration and cleaning. Furthermore, including low-quality data can even deteriorate the quality of integration results instead of bringing the desired quality gain. Thus, ``the more the better'' does not always hold for data integration and often ``less is more''. In this paper, we study how to select a subset of sources before integration such that we can balance the quality of integrated data and integration cost.
Inspired by the Marginalism principle in economic theory, we wish to integrate a new source only if its marginal gain, often a function of improved integration quality, is higher than the marginal cost, associated with data-purchase expense and integration resources. As a first step towards this goal, we focus on data fusion tasks, where the goal is to resolve conflicts from different sources. We propose a randomized solution for selecting sources for fusion and show empirically its effectiveness and scalability on both real-world data and synthetic data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2012:DTA, author = "Wenchao Zhou and Suyog Mapara and Yiqing Ren and Yang Li and Andreas Haeberlen and Zachary Ives and Boon Thau Loo and Micah Sherr", title = "Distributed time-aware provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "49--60", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to reason about changes in a distributed system's state enables network administrators to better diagnose protocol misconfigurations, detect intrusions, and pinpoint performance bottlenecks. We propose a novel provenance model called Distributed Time-aware Provenance (DTaP) that aids forensics and debugging in distributed systems by explicitly representing time, distributed state, and state changes. Using a distributed Datalog abstraction for modeling distributed protocols, we prove that the DTaP model provides a sound and complete representation that correctly captures dependencies among events in a distributed system. 
We additionally introduce DistTape, an implementation of the DTaP model that uses novel distributed storage structures, query processing, and cost-based optimization techniques to efficiently query time-aware provenance in a distributed setting. Using two example systems (declarative network routing and Hadoop MapReduce), we demonstrate that DistTape can efficiently maintain and query time-aware provenance at low communication and computation cost.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Calvanese:2012:QPU, author = "Diego Calvanese and Giuseppe {De Giacomo} and Maurizio Lenzerini and Moshe Y. Vardi", title = "Query processing under {GLAV} mappings for relational and graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "61--72", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Schema mappings establish a correspondence between data stored in two databases, called source and target respectively. Query processing under schema mappings has been investigated extensively in the two cases where each target atom is mapped to a query over the source (called GAV, global-as-view), and where each source atom is mapped to a query over the target (called LAV, local-as-view). The general case, called GLAV, in which queries over the source are mapped to queries over the target, has attracted a lot of attention recently, especially for data exchange. However, query processing for GLAV mappings has been considered only for the basic service of query answering, and mainly in the context of conjunctive queries (CQs) in relational databases. 
In this paper we study query processing for GLAV mappings in a wider sense, considering not only query answering, but also query rewriting, perfectness (the property of a rewriting to compute exactly the certain answers), and query containment relative to a mapping. We deal both with the relational case, and with graph databases, where the basic querying mechanism is that of regular path queries. Query answering in GLAV can be smoothly reduced to a combination of the LAV and GAV cases, and for CQs this reduction can be exploited also for the remaining query processing tasks. In contrast, as we show, GLAV query processing for graph databases is non-trivial and requires new insights and techniques. We obtain upper bounds for answering, rewriting, and perfectness, and show decidability of relative containment.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mouratidis:2012:CIR, author = "Kyriakos Mouratidis and HweeHwa Pang", title = "Computing immutable regions for subspace top-$k$ queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "73--84", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a high-dimensional dataset, a top-$k$ query can be used to shortlist the $k$ tuples that best match the user's preferences. Typically, these preferences regard a subset of the available dimensions (i.e., attributes) whose relative significance is expressed by user-specified weights. Along with the query result, we propose to compute for each involved dimension the maximal deviation to the corresponding weight for which the query result remains valid. 
The derived weight ranges, called immutable regions, are useful for performing sensitivity analysis, for fine-tuning the query weights, etc. In this paper, we focus on top-$k$ queries with linear preference functions over the queried dimensions. We codify the conditions under which changes in a dimension's weight invalidate the query result, and develop algorithms to compute the immutable regions. In general, this entails the examination of numerous non-result tuples. To reduce processing time, we introduce a pruning technique and a thresholding mechanism that allow the immutable regions to be determined correctly after examining only a small number of non-result tuples. We demonstrate empirically that the two techniques combine well to form a robust and highly resource-efficient algorithm. We verify the generality of our findings using real high-dimensional data from different domains (documents, images, etc) and with different characteristics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2012:LSC, author = "Feng Zhao and Anthony K. H. Tung", title = "Large scale cohesive subgraphs discovery for social network visual analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "85--96", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs are widely used in large scale social network analysis nowadays. Not only analysts need to focus on cohesive subgraphs to study patterns among social actors, but also normal users are interested in discovering what is happening in their neighborhood. However, effectively storing large scale social network and efficiently identifying cohesive subgraphs is challenging.
In this work we introduce a novel subgraph concept to capture the cohesion in social interactions, and propose an I/O efficient approach to discover cohesive subgraphs. Besides, we propose an analytic system which allows users to perform intuitive, visual browsing on large scale social networks. Our system stores the network as a social graph in the graph database, retrieves a local cohesive subgraph based on the input keywords, and then hierarchically visualizes the subgraph out on orbital layout, in which more important social actors are located in the center. By summarizing textual interactions between social actors as tag cloud, we provide a way to quickly locate active social communities and their interactions in a unified view.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2012:TFD, author = "Xian Li and Xin Luna Dong and Kenneth Lyons and Weiyi Meng and Divesh Srivastava", title = "Truth finding on the {Deep Web}: is the problem solved?", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "97--108", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The amount of useful information available on the Web has been growing at a dramatic pace in recent years and people rely more and more on the Web to fulfill their information needs. In this paper, we study truthfulness of Deep Web data in two domains where we believed data are fairly clean and data quality is important to people's lives: Stock and Flight. To our surprise, we observed a large amount of inconsistency on data from different sources and also some sources with quite low accuracy. 
We further applied on these two data sets state-of-the-art data fusion methods that aim at resolving conflicts and finding the truth, analyzed their strengths and limitations, and suggested promising research directions. We wish our study can increase awareness of the seriousness of conflicting data on the Web and in turn inspire more research in our community to tackle this problem.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marcus:2012:CC, author = "Adam Marcus and David Karger and Samuel Madden and Robert Miller and Sewoong Oh", title = "Counting with the crowd", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "109--120", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we address the problem of selectivity estimation in a crowdsourced database. Specifically, we develop several techniques for using workers on a crowdsourcing platform like Amazon's Mechanical Turk to estimate the fraction of items in a dataset (e.g., a collection of photos) that satisfy some property or predicate (e.g., photos of trees). We do this without explicitly iterating through every item in the dataset. This is important in crowd-sourced query optimization to support predicate ordering and in query evaluation, when performing a GROUP BY operation with a COUNT or AVG aggregate. We compare sampling item labels, a traditional approach, to showing workers a collection of items and asking them to estimate how many satisfy some predicate. Additionally, we develop techniques to eliminate spammers and colluding attackers trying to skew selectivity estimates when using this count estimation approach. 
We find that for images, counting can be much more effective than sampled labeling, reducing the amount of work necessary to arrive at an estimate that is within 1\% of the true fraction by up to an order of magnitude, with lower worker latency. We also find that sampled labeling outperforms count estimation on a text processing task, presumably because people are better at quickly processing large batches of images than they are at reading strings of text. Our spammer detection technique, which is applicable to both the label- and count-based approaches, can improve accuracy by up to two orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zou:2012:CDA, author = "Tao Zou and Ronan {Le Bras} and Marcos {Vaz Salles} and Alan Demers and Johannes Gehrke", title = "{ClouDiA}: a deployment advisor for public clouds", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "121--132", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An increasing number of distributed data-driven applications are moving into shared public clouds. By sharing resources and operating at scale, public clouds promise higher utilization and lower costs than private clusters. To achieve high utilization, however, cloud providers inevitably allocate virtual machine instances noncontiguously, i.e., instances of a given application may end up in physically distant machines in the cloud. This allocation strategy can lead to large differences in average latency between instances. For a large class of applications, this difference can result in significant performance degradation, unless care is taken in how application components are mapped to instances. 
In this paper, we propose ClouDiA, a general deployment advisor that selects application node deployments minimizing either (i) the largest latency between application nodes, or (ii) the longest critical path among all application nodes. ClouDiA employs mixed-integer programming and constraint programming techniques to efficiently search the space of possible mappings of application nodes to instances. Through experiments with synthetic and real applications in Amazon EC2, we show that our techniques yield a 15\% to 55\% reduction in time-to-solution or service response time, without any need for modifying application code.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2012:DCS, author = "Jinsoo Lee and Wook-Shin Han and Romans Kasperovics and Jeong-Hoon Lee", title = "An in-depth comparison of subgraph isomorphism algorithms in graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "133--144", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding subgraph isomorphisms is an important problem in many applications which deal with data modeled as graphs. While this problem is NP-hard, in recent years, many algorithms have been proposed to solve it in a reasonable time for real datasets using different join orders, pruning rules, and auxiliary neighborhood information. However, since they have not been empirically compared one another in most research work, it is not clear whether the later work outperforms the earlier work. Another problem is that reported comparisons were often done using the original authors' binaries which were written in different programming environments. 
In this paper, we address these serious problems by re-implementing five state-of-the-art subgraph isomorphism algorithms in a common code base and by comparing them using many real-world datasets and their query loads. Through our in-depth analysis of experimental results, we report surprising empirical findings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2012:LLM, author = "Kun Ren and Alexander Thomson and Daniel J. Abadi", title = "Lightweight locking for main memory database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "2", pages = "145--156", month = dec, year = "2012", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:14 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Locking is widely used as a concurrency control mechanism in database systems. As more OLTP databases are stored mostly or entirely in memory, transactional throughput is less and less limited by disk IO, and lock managers increasingly become performance bottlenecks. In this paper, we introduce very lightweight locking (VLL), an alternative approach to pessimistic concurrency control for main-memory database systems that avoids almost all overhead associated with traditional lock manager operations. We also propose a protocol called selective contention analysis (SCA), which enables systems implementing VLL to achieve high transactional throughput under high contention workloads. We implement these protocols both in a traditional single-machine multi-core database server setting and in a distributed database where data is partitioned across many commodity machines in a shared-nothing cluster. 
Our experiments show that VLL dramatically reduces locking overhead and thereby increases transactional throughput in both settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2013:LPP, author = "Ye Zhang and Wai-Kit Wong and S. M. Yiu and Nikos Mamoulis and David W. Cheung", title = "Lightweight privacy-preserving peer-to-peer data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "157--168", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Peer Data Management Systems (PDMS) are an attractive solution for managing distributed heterogeneous information. When a peer (client) requests data from another peer (server) with a different schema, translations of the query and its answer are done by a sequence of intermediate peers (translators). There are two privacy issues in this P2P data integration process: (i) answer privacy: no unauthorized parties (including the translators) should learn the query result; (ii) mapping privacy: the schema and the value mappings used by the translators to perform the translation should not be revealed to other peers. Elmeleegy and Ouzzani proposed the PPP protocol that is the first to support privacy-preserving querying in PDMS. However, PPP suffers from several shortcomings. First, PPP does not satisfy the requirement of answer privacy, because it is based on commutative encryption; we show that this issue can be fixed by adopting another cryptographic technique called oblivious transfer. Second, PPP adopts a weaker notion for mapping privacy, which allows the client peer to observe certain mappings done by translators. 
In this paper, we develop a lightweight protocol, which satisfies mapping privacy and extend it to a more complex one that facilitates parallel translation by peers. Furthermore, we consider a stronger adversary model where there may be collusions among peers and propose an efficient protocol that guards against collusions. We conduct an experimental study on the performance of the proposed protocols using both real and synthetic data. The results show that the proposed protocols not only achieve a better privacy guarantee than PPP, but they are also more efficient.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2013:MEM, author = "Yang Li and Pegah Kamousi and Fangqiu Han and Shengqi Yang and Xifeng Yan and Subhash Suri", title = "Memory efficient minimum substring partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "169--180", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massively parallel DNA sequencing technologies are revolutionizing genomics research. Billions of short reads generated at low costs can be assembled for reconstructing the whole genomes. Unfortunately, the large memory footprint of the existing de novo assembly algorithms makes it challenging to get the assembly done for higher eukaryotes like mammals. In this work, we investigate the memory issue of constructing de Bruijn graph, a core task in leading assembly algorithms, which often consumes several hundreds of gigabytes memory for large genomes. We propose a disk-based partition method, called Minimum Substring Partitioning (MSP), to complete the task using less than 10 gigabytes memory, without runtime slowdown. 
MSP breaks the short reads into multiple small disjoint partitions so that each partition can be loaded into memory, processed individually and later merged with others to form a de Bruijn graph. By leveraging the overlaps among the $k$-mers (substring of length $k$), MSP achieves astonishing compression ratio: The total size of partitions is reduced from $ \Theta (k n) $ to $ \Theta (n) $, where $n$ is the size of the short read database, and $k$ is the length of a $k$-mer. Experimental results show that our method can build de Bruijn graphs using a commodity computer for any large-volume sequence dataset.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2013:NFG, author = "Arijit Khan and Yinghui Wu and Charu C. Aggarwal and Xifeng Yan", title = "{NeMa}: fast graph search with label similarity", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "181--192", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "It is increasingly common to find real-life data represented as networks of labeled, heterogeneous entities. To query these networks, one often needs to identify the matches of a given query graph in a (typically large) network modeled as a target graph. Due to noise and the lack of fixed schema in the target graph, the query graph can substantially differ from its matches in the target graph in both structure and node labels, thus bringing challenges to the graph querying tasks. In this paper, we propose NeMa (Network Match), a neighborhood-based subgraph matching technique for querying real-life networks.
(1) To measure the quality of the match, we propose a novel subgraph matching cost metric that aggregates the costs of matching individual nodes, and unifies both structure and node label similarities. (2) Based on the metric, we formulate the minimum cost subgraph matching problem. Given a query graph and a target graph, the problem is to identify the (top-$k$) matches of the query graph with minimum costs in the target graph. We show that the problem is NP-hard, and also hard to approximate. (3) We propose a heuristic algorithm for solving the problem based on an inference model. In addition, we propose optimization techniques to improve the efficiency of our method. (4) We empirically verify that NeMa is both effective and efficient compared to the keyword search and various state-of-the-art graph querying techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2013:PPS, author = "Xika Lin and Abhishek Mukherji and Elke A. Rundensteiner and Carolina Ruiz and Matthew O. Ward", title = "{PARAS}: a parameter space framework for online association mining", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "193--204", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Association rule mining is known to be computationally intensive, yet real-time decision-making applications are increasingly intolerant to delays. In this paper, we introduce the parameter space model, called PARAS. PARAS enables efficient rule mining by compactly maintaining the final rulesets. The PARAS model is based on the notion of stable region abstractions that form the coarse granularity ruleset space. 
Based on new insights on the redundancy relationships among rules, PARAS establishes a surprisingly compact representation of complex redundancy relationships while enabling efficient redundancy resolution at query-time. Besides the classical rule mining requests, the PARAS model supports three novel classes of exploratory queries. Using the proposed PSpace index, these exploratory query classes can all be answered with near real-time responsiveness. Our experimental evaluation using several benchmark datasets demonstrates that PARAS achieves 2 to 5 orders of magnitude improvement over state-of-the-art approaches in online association rule mining.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2013:ASF, author = "Zhepeng Yan and Nan Zheng and Zachary G. Ives and Partha Pratim Talukdar and Cong Yu", title = "Actively soliciting feedback for query answers in keyword search-based data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "205--216", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of scaling up data integration, such that new sources can be quickly utilized as they are discovered, remains elusive: global schemas for integrated data are difficult to develop and expand, and schema and record matching techniques are limited by the fact that data and metadata are often under-specified and must be disambiguated by data experts. One promising approach is to avoid using a global schema, and instead to develop keyword search-based data integration--where the system lazily discovers associations enabling it to join together matches to keywords, and return ranked results. 
The user is expected to understand the data domain and provide feedback about answers' quality. The system generalizes such feedback to learn how to correctly integrate data. A major open challenge is that under this model, the user only sees and offers feedback on a few ``top-$k$'' results: this result set must be carefully selected to include answers of high relevance and answers that are highly informative when feedback is given on them. Existing systems merely focus on predicting relevance, by composing the scores of various schema and record matching algorithms. In this paper we show how to predict the uncertainty associated with a query result's score, as well as how informative feedback is on a given result. We build upon these foundations to develop an active learning approach to keyword search-based data integration, and we validate the effectiveness of our solution over real data from several very different domains.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2013:SKQ, author = "Lisi Chen and Gao Cong and Christian S. Jensen and Dingming Wu", title = "Spatial keyword query processing: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "3", pages = "217--228", month = jan, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:18 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Geo-textual indices play an important role in spatial keyword querying. The existing geo-textual indices have not been compared systematically under the same experimental framework. This makes it difficult to determine which indexing technique best supports specific functionality. We provide an all-around survey of 12 state-of-the-art geo-textual indices. 
We propose a benchmark that enables the comparison of the spatial keyword query performance. We also report on the findings obtained when applying the benchmark to the indices, thus uncovering new insights that may guide index selection as well as further research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eftekhar:2013:PRT, author = "Milad Eftekhar and Nick Koudas", title = "Partitioning and ranking tagged data sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "4", pages = "229--240", month = feb, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:22 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Online types of expression in the form of social networks, micro-blogging, blogs and rich content sharing platforms have proliferated in the last few years. Such proliferation contributed to the vast explosion in online data sharing we are experiencing today. One unique aspect of online data sharing is tags manually inserted by content generators to facilitate content description and discovery (e.g., hashtags in tweets). In this paper we focus on these tags and we study and propose algorithms that make use of tags in order to automatically organize and categorize this vast collection of socially contributed and tagged information. In particular, we take a holistic approach in organizing such tags and we propose algorithms to partition as well as rank this information collection. Our partitioning algorithms aim to segment the entire collection of tags (and the associated content) into a specified number of partitions for specific problem constraints. In contrast our ranking algorithms aim to identify few partitions fast, for suitably defined ranking functions. 
We present a detailed experimental study utilizing the full twitter firehose (set of all tweets in the Twitter service) that attests to the practical utility and effectiveness of our overall approach. We also present a detailed qualitative study of our results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Badia:2013:EIG, author = "Antonio Badia and Bin Cao", title = "Efficient implementation of generalized quantification in relational query languages", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "4", pages = "241--252", month = feb, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:22 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present research aimed at improving our understanding of the use and implementation of quantification in relational query languages in general and SQL in particular. In order to make our results as general as possible, we use the framework of Generalized Quantification. Generalized Quantifiers (GQs) are high-level, declarative logical operators that in the past have been studied from a theoretical perspective. In this paper we focus on their practical use, showing how to incorporate a dynamic set of GQs in relational query languages, how to implement them efficiently and use them in the context of SQL. We present experimental evidence of the performance of the approach, showing that it improves over traditional (relational) approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2013:DWD, author = "Rui Liu and Ashraf Aboulnaga and Kenneth Salem", title = "{DAX}: a widely distributed multitenant storage service for {DBMS} hosting", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "4", pages = "253--264", month = feb, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:22 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many applications hosted on the cloud have sophisticated data management needs that are best served by a SQL-based relational DBMS. It is not difficult to run a DBMS in the cloud, and in many cases one DBMS instance is enough to support an application's workload. However, a DBMS running in the cloud (or even on a local server) still needs a way to persistently store its data and protect it against failures. One way to achieve this is to provide a scalable and reliable storage service that the DBMS can access over a network. This paper describes such a service, which we call DAX. DAX relies on multi-master replication and Dynamo-style flexible consistency, which enables it to run in multiple data centers and hence be disaster tolerant. Flexible consistency allows DAX to control the consistency level of each read or write operation, choosing between strong consistency at the cost of high latency or weak consistency with low latency. DAX makes this choice for each read or write operation by applying protocols that we designed based on the storage tier usage characteristics of database systems. With these protocols, DAX provides a storage service that can host multiple DBMS tenants, scaling with the number of tenants and the required storage capacity and bandwidth. DAX also provides high availability and disaster tolerance for the DBMS storage tier. 
Experiments using the TPC-C benchmark show that DAX provides up to a factor of 4 performance improvement over baseline solutions that do not exploit flexible consistency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2013:DGE, author = "Kai Zeng and Jiacheng Yang and Haixun Wang and Bin Shao and Zhongyuan Wang", title = "A distributed graph engine for web scale {RDF} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "4", pages = "265--276", month = feb, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:22 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Much work has been devoted to supporting RDF data. But state-of-the-art systems and methods still cannot handle web scale RDF data effectively. Furthermore, many useful and general purpose graph-based operations (e.g., random walk, reachability, community discovery) on RDF data are not supported, as most existing systems store and index data in particular ways (e.g., as relational tables or as a bitmap matrix) to maximize one particular operation on RDF data: SPARQL query processing. In this paper, we introduce Trinity.RDF, a distributed, memory-based graph engine for web scale RDF data. Instead of managing the RDF data in triple stores or as bitmap matrices, we store RDF data in its native graph form. It achieves much better (sometimes orders of magnitude better) performance for SPARQL queries than the state-of-the-art approaches. Furthermore, since the data is stored in its native graph form, the system can support other operations (e.g., random walks, reachability) on RDF graphs as well.
We conduct comprehensive experimental studies on real life, web scale RDF data to demonstrate the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sarma:2013:ULB, author = "Anish {Das Sarma} and Foto N. Afrati and Semih Salihoglu and Jeffrey D. Ullman", title = "Upper and lower bounds on the cost of a map-reduce computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "4", pages = "277--288", month = feb, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:22 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we study the tradeoff between parallelism and communication cost in a map-reduce computation. For any problem that is not ``embarrassingly parallel,'' the finer we partition the work of the reducers so that more parallelism can be extracted, the greater will be the total communication between mappers and reducers. We introduce a model of problems that can be solved in a single round of map-reduce computation. This model enables a generic recipe for discovering lower bounds on communication cost as a function of the maximum number of inputs that can be assigned to one reducer. We use the model to analyze the tradeoff for three problems: finding pairs of strings at Hamming distance $d$, finding triangles and other patterns in a larger graph, and matrix multiplication. For finding strings of Hamming distance 1, we have upper and lower bounds that match exactly. For triangles and many other graphs, we have upper and lower bounds that are the same to within a constant factor. For the problem of matrix multiplication, we have matching upper and lower bounds for one-round map-reduce algorithms.
We are also able to explore two-round map-reduce algorithms for matrix multiplication and show that these never have more communication, for a given reducer size, than the best one-round algorithm, and often have significantly less.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tu:2013:PAQ, author = "Stephen Tu and M. Frans Kaashoek and Samuel Madden and Nickolai Zeldovich", title = "Processing analytical queries over encrypted data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "5", pages = "289--300", month = mar, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:27 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MONOMI is a system for securely executing analytical workloads over sensitive data on an untrusted database server. MONOMI works by encrypting the entire database and running queries over the encrypted data. MONOMI introduces split client/server query execution, which can execute arbitrarily complex queries over encrypted data, as well as several techniques that improve performance for such workloads, including per-row precomputation, space-efficient encryption, grouped homomorphic addition, and pre-filtering. Since these optimizations are good for some queries but not others, MONOMI introduces a designer for choosing an efficient physical design at the server for a given workload, and a planner to choose an efficient execution plan for a given query at runtime. 
A prototype of MONOMI running on top of Postgres can execute most of the queries from the TPC-H benchmark with a median overhead of only $ 1.24 \times $ (ranging from $ 1.03 \times $ to $ 2.33 \times $) compared to an un-encrypted Postgres database where a compromised server would reveal all data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kellaris:2013:PDP, author = "Georgios Kellaris and Stavros Papadopoulos", title = "Practical differential privacy via grouping and smoothing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "5", pages = "301--312", month = mar, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:27 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We address one-time publishing of non-overlapping counts with $ \epsilon $-differential privacy. These statistics are useful in a wide and important range of applications, including transactional, traffic and medical data analysis. Prior work on the topic publishes such statistics with prohibitively low utility in several practical scenarios. Towards this end, we present GS, a method that pre-processes the counts by elaborately grouping and smoothing them via averaging. This step acts as a form of preliminary perturbation that diminishes sensitivity, and enables GS to achieve $ \epsilon $-differential privacy through low Laplace noise injection. The grouping strategy is dictated by a sampling mechanism, which minimizes the smoothing perturbation. We demonstrate the superiority of GS over its competitors, and confirm its practicality, via extensive experiments on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaushik:2013:SSD, author = "Raghav Kaushik and Yupeng Fu and Ravishankar Ramamurthy", title = "On scaling up sensitive data auditing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "5", pages = "313--324", month = mar, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:27 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies the following problem: given (1) a query and (2) a set of sensitive records, find the subset of records ``accessed'' by the query. The notion of a query accessing a single record is adopted from prior work. There are several scenarios where the number of sensitive records is large (in the millions). The novel challenge addressed in this work is to develop a general-purpose solution for complex SQL that scales in the number of sensitive records. We propose efficient techniques that improves upon straightforward alternatives by orders of magnitude. Our empirical evaluation over the TPC-H benchmark data illustrates the benefits of our techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sathiamoorthy:2013:XEN, author = "Maheswaran Sathiamoorthy and Megasthenis Asteris and Dimitris Papailiopoulos and Alexandros G. 
Dimakis and Ramkumar Vadali and Scott Chen and Dhruba Borthakur", title = "{XORing} elephants: novel erasure codes for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "5", pages = "325--336", month = mar, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:27 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed storage systems for large clusters typically use replication to provide reliability. Recently, erasure codes have been used to reduce the large storage overhead of three-replicated systems. Reed--Solomon codes are the standard design choice and their high repair cost is often considered an unavoidable price to pay for high storage efficiency and high reliability. This paper shows how to overcome this limitation. We present a novel family of erasure codes that are efficiently repairable and offer higher reliability compared to Reed--Solomon codes. We show analytically that our codes are optimal on a recently identified tradeoff between locality and minimum distance. We implement our new codes in Hadoop HDFS and compare to a currently deployed HDFS module that uses Reed--Solomon codes. Our modified HDFS implementation shows a reduction of approximately $ 2 \times $ on the repair disk I/O and repair network traffic. The disadvantage of the new coding scheme is that it requires 14\% more storage compared to Reed--Solomon codes, an overhead shown to be information theoretically optimal to obtain locality. Because the new codes repair failures faster, this provides higher reliability, which is orders of magnitude higher compared to replication.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rendle:2013:SFM, author = "Steffen Rendle", title = "Scaling factorization machines to relational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "5", pages = "337--348", month = mar, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:27 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The most common approach in predictive modeling is to describe cases with feature vectors (aka design matrix). Many machine learning methods such as linear regression or support vector machines rely on this representation. However, when the underlying data has strong relational patterns, especially relations with high cardinality, the design matrix can get very large which can make learning and prediction slow or even infeasible. This work solves this issue by making use of repeating patterns in the design matrix which stem from the underlying relational structure of the data. It is shown how coordinate descent learning and Bayesian Markov Chain Monte Carlo inference can be scaled for linear regression and factorization machine models. Empirically, it is shown on two large scale and very competitive datasets (Netflix prize, KDDCup 2012), that (1) standard learning algorithms based on the design matrix representation cannot scale to relational predictor variables, (2) the proposed new algorithms scale and (3) the predictive quality of the proposed generic feature-based approach is as good as the best specialized models that have been tailored to the respective tasks.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Whang:2013:QSC, author = "Steven Euijong Whang and Peter Lofgren and Hector Garcia-Molina", title = "Question selection for crowd entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "349--360", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of enhancing Entity Resolution (ER) with the help of crowdsourcing. ER is the problem of clustering records that refer to the same real-world entity and can be an extremely difficult process for computer algorithms alone. For example, figuring out which images refer to the same person can be a hard task for computers, but an easy one for humans. We study the problem of resolving records with crowdsourcing where we ask questions to humans in order to guide ER into producing accurate results. Since human work is costly, our goal is to ask as few questions as possible. We propose a probabilistic framework for ER that can be used to estimate how much ER accuracy we obtain by asking each question and select the best question with the highest expected accuracy. Computing the expected accuracy is \#P-hard, so we propose approximation techniques for efficient computation. We evaluate our best question algorithms on real and synthetic datasets and demonstrate how we can obtain high ER accuracy while significantly reducing the number of questions asked to humans.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jindal:2013:CKB, author = "Alekh Jindal and Endre Palatinus and Vladimir Pavlov and Jens Dittrich", title = "A comparison of knives for bread slicing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "361--372", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vertical partitioning is a crucial step in physical database design in row-oriented databases. A number of vertical partitioning algorithms have been proposed over the last three decades for a variety of niche scenarios. In principle, the underlying problem remains the same: decompose a table into one or more vertical partitions. However, it is not clear how good different vertical partitioning algorithms are in comparison to each other. In fact, it is not even clear how to experimentally compare different vertical partitioning algorithms. In this paper, we present an exhaustive experimental study of several vertical partitioning algorithms. We categorize vertical partitioning algorithms along three dimensions. We survey six vertical partitioning algorithms and discuss their pros and cons. We identify the major differences in the use-case settings for different algorithms and describe how to make an apples-to-apples comparison of different vertical partitioning algorithms under the same setting. We propose four metrics to compare vertical partitioning algorithms. 
We show experimental results from the TPC-H and SSB benchmark and present four key lessons learned: (1) we can do four orders of magnitude less computation and still find the optimal layouts, (2) the benefits of vertical partitioning depend strongly on the database buffer size, (3) HillClimb is the best vertical partitioning algorithm, and (4) vertical partitioning for TPC-H-like benchmarks can improve over column layout by only up to 5\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xiao:2013:EET, author = "Chuan Xiao and Jianbin Qin and Wei Wang and Yoshiharu Ishikawa and Koji Tsuda and Kunihiko Sadakane", title = "Efficient error-tolerant query autocompletion", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "373--384", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query autocompletion is an important feature saving users many keystrokes from typing the entire query. In this paper we study the problem of query autocompletion that tolerates errors in users' input using edit distance constraints. Previous approaches index data strings in a trie, and continuously maintain all the prefixes of data strings whose edit distance from the query are within the threshold. The major inherent problem is that the number of such prefixes is huge for the first few characters of the query and is exponential in the alphabet size. This results in slow query response even if the entire query approximately matches only few prefixes. In this paper, we propose a novel neighborhood generation-based algorithm, IncNGTrie, which can achieve up to two orders of magnitude speedup over existing methods for the error-tolerant query autocompletion problem. 
Our proposed algorithm only maintains a small set of active nodes, thus saving both space and time to process the query. We also study efficient duplicate removal which is a core problem in fetching query answers. In addition, we propose optimization techniques to reduce our index size, as well as discussions on several extensions to our method. The efficiency of our method is demonstrated against existing methods through extensive experiments on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shraer:2013:TKP, author = "Alexander Shraer and Maxim Gurevich and Marcus Fontoura and Vanja Josifovski", title = "Top-$k$ publish-subscribe for social annotation of news", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "385--396", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social content, such as Twitter updates, often have the quickest first-hand reports of news events, as well as numerous commentaries that are indicative of public view of such events. As such, social updates provide a good complement to professionally written news articles. In this paper we consider the problem of automatically annotating news stories with social updates (tweets), at a news website serving high volume of pageviews. The high rate of both the pageviews (millions to billions a day) and of the incoming tweets (more than 100 millions a day) make real-time indexing of tweets ineffective, as this requires an index that is both queried and updated extremely frequently. The rate of tweet updates makes caching techniques almost unusable since the cache would become stale very quickly. 
We propose a novel architecture where each story is treated as a subscription for tweets relevant to the story's content, and new algorithms that efficiently match tweets to stories, proactively maintaining the top-$k$ tweets for each story. Such top-$k$ pub-sub consumes only a small fraction of the resource cost of alternative solutions, and can be applicable to other large scale content-based publish-subscribe problems. We demonstrate the effectiveness of our approach on real-world data: a corpus of news stories from Yahoo! News and a log of Twitter updates.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kolaitis:2013:EQI, author = "Phokion G. Kolaitis and Enela Pema and Wang-Chiew Tan", title = "Efficient querying of inconsistent databases with binary integer programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "397--408", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An inconsistent database is a database that violates one or more integrity constraints. A typical approach for answering a query over an inconsistent database is to first clean the inconsistent database by transforming it to a consistent one and then apply the query to the consistent database. An alternative and more principled approach, known as consistent query answering, derives the answers to a query over an inconsistent database without changing the database, but by taking into account all possible repairs of the database. In this paper, we study the problem of consistent query answering over inconsistent databases for the class of conjunctive queries under primary key constraints. 
We develop a system, called EQUIP, that represents a fundamental departure from existing approaches for computing the consistent answers to queries in this class. At the heart of EQUIP is a technique, based on Binary Integer Programming (BIP), that repeatedly searches for repairs to eliminate candidate consistent answers until no further such candidates can be eliminated. We establish rigorously the correctness of the algorithms behind EQUIP and carry out an extensive experimental investigation that validates the effectiveness of our approach. Specifically, EQUIP exhibits good and stable performance on conjunctive queries under primary key constraints, it significantly outperforms existing systems for computing the consistent answers of such queries in the case in which the consistent answers are not first-order rewritable, and it scales well.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gionis:2013:PSN, author = "Aristides Gionis and Flavio Junqueira and Vincent Leroy and Marco Serafini and Ingmar Weber", title = "Piggybacking on social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "409--420", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The popularity of social-networking sites has increased rapidly over the last decade. A basic functionality of social-networking sites is to present users with streams of events shared by their friends. At a systems level, materialized per-user views are a common way to assemble and deliver such event streams on-line and with low latency. Access to the data stores, which keep the user views, is a major bottleneck of social-networking systems. 
We propose to improve the throughput of these systems by using social piggybacking, which consists of processing the requests of two friends by querying and updating the view of a third common friend. By using one such hub view, the system can serve requests of the first friend without querying or updating the view of the second. We show that, given a social graph, social piggybacking can minimize the overall number of requests, but computing the optimal set of hubs is an NP-hard problem. We propose an $ O(\log n) $ approximation algorithm and a heuristic to solve the problem, and evaluate them using the full Twitter and Flickr social graphs, which have up to billions of edges. Compared to existing approaches, using social piggybacking results in similar throughput in systems with few servers, but enables substantial throughput improvements as the size of the system grows, reaching up to a 2-factor increase. We also evaluate our algorithms on a real social networking system prototype and we show that the actual increase in throughput corresponds nicely to the gain anticipated by our cost function.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Adelfio:2013:SET, author = "Marco D. Adelfio and Hanan Samet", title = "Schema extraction for tabular data on the {Web}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "421--432", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tabular data is an abundant source of information on the Web, but remains mostly isolated from the latter's interconnections since tables lack links and computer-accessible descriptions of their structure. 
In other words, the schemas of these tables --- attribute names, values, data types, etc. --- are not explicitly stored as table metadata. Consequently, the structure that these tables contain is not accessible to the crawlers that power search engines and thus not accessible to user search queries. We address this lack of structure with a new method for leveraging the principles of table construction in order to extract table schemas. Discovering the schema by which a table is constructed is achieved by harnessing the similarities and differences of nearby table rows through the use of a novel set of features and a feature processing scheme. The schemas of these data tables are determined using a classification technique based on conditional random fields in combination with a novel feature encoding method called logarithmic binning, which is specifically designed for the data table extraction task. Our method provides considerable improvement over the well-known WebTables schema extraction method. In contrast with previous work that focuses on extracting individual relations, our method excels at correctly interpreting full tables, thereby being capable of handling general tables such as those found in spreadsheets, instead of being restricted to HTML tables as is the case with the WebTables method. We also extract additional schema characteristics, such as row groupings, which are important for supporting information retrieval tasks on tabular data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sariyuce:2013:SAK, author = "Ahmet Erdem Sar{\'\i}y{\"u}ce and Bugra Gedik and Gabriela Jacques-Silva and Kun-Lung Wu and {\"U}mit V. 
{\c{C}}ataly{\"u}rek", title = "Streaming algorithms for $k$-core decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "433--444", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A $k$-core of a graph is a maximal connected subgraph in which every vertex is connected to at least $k$ vertices in the subgraph. $k$-core decomposition is often used in large-scale network analysis, such as community detection, protein function prediction, visualization, and solving NP-Hard problems on real networks efficiently, like maximal clique finding. In many real-world applications, networks change over time. As a result, it is essential to develop efficient incremental algorithms for streaming graph data. In this paper, we propose the first incremental $k$-core decomposition algorithms for streaming graph data. These algorithms locate a small subgraph that is guaranteed to contain the list of vertices whose maximum $k$-core values have to be updated, and efficiently process this subgraph to update the $k$-core decomposition. Our results show a significant reduction in run-time compared to non-incremental alternatives. We show the efficiency of our algorithms on different types of real and synthetic graphs, at different scales. For a graph of 16 million vertices, we observe speedups reaching a million times, relative to the non-incremental algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hassanzadeh:2013:DLP, author = "Oktie Hassanzadeh and Ken Q. Pu and Soheil Hassas Yeganeh and Ren{\'e}e J. Miller and Lucian Popa and Mauricio A. 
Hern{\'a}ndez and Howard Ho", title = "Discovering linkage points over {Web} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "445--456", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A basic step in integration is the identification of linkage points, i.e., finding attributes that are shared (or related) between data sources, and that can be used to match records or entities across sources. This is usually performed using a match operator, that associates attributes of one database to another. However, the massive growth in the amount and variety of unstructured and semi-structured data on the Web has created new challenges for this task. Such data sources often do not have a fixed pre-defined schema and contain large numbers of diverse attributes. Furthermore, the end goal is not schema alignment as these schemas may be too heterogeneous (and dynamic) to meaningfully align. Rather, the goal is to align any overlapping data shared by these sources. We will show that even attributes with different meanings (that would not qualify as schema matches) can sometimes be useful in aligning data. The solution we propose in this paper replaces the basic schema-matching step with a more complex instance-based schema analysis and linkage discovery. We present a framework consisting of a library of efficient lexical analyzers and similarity functions, and a set of search algorithms for effective and efficient identification of linkage points over Web data. We experimentally evaluate the effectiveness of our proposed algorithms in real-world integration scenarios in several domains.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fu:2013:LIS, author = "Ada Wai-Chee Fu and Huanhuan Wu and James Cheng and Raymond Chi-Wing Wong", title = "{IS-Label}: an independent-set based labeling scheme for point-to-point distance querying", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "457--468", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of computing shortest path or distance between two query vertices in a graph, which has numerous important applications. Quite a number of indexes have been proposed to answer such distance queries. However, all of these indexes can only process graphs of size barely up to 1 million vertices, which is rather small in view of many of the fast-growing real-world graphs today such as social networks and Web graphs. We propose an efficient index, which is a novel labeling scheme based on the independent set of a graph. We show that our method can handle graphs of size orders of magnitude larger than existing indexes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tran:2013:SUD, author = "Thanh T. L. 
Tran and Yanlei Diao and Charles Sutton and Anna Liu", title = "Supporting user-defined functions on uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "469--480", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Uncertain data management has become crucial in many sensing and scientific applications. As user-defined functions (UDFs) become widely used in these applications, an important task is to capture result uncertainty for queries that evaluate UDFs on uncertain data. In this work, we provide a general framework for supporting UDFs on uncertain data. Specifically, we propose a learning approach based on Gaussian processes (GPs) to compute approximate output distributions of a UDF when evaluated on uncertain input, with guaranteed error bounds. We also devise an online algorithm to compute such output distributions, which employs a suite of optimizations to improve accuracy and performance. Our evaluation using both real-world and synthetic functions shows that our proposed GP approach can outperform the state-of-the-art sampling approach with up to two orders of magnitude improvement for a variety of UDFs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2013:IAA, author = "Fanwei Zhu and Yuan Fang and Kevin Chen-Chuan Chang and Jing Ying", title = "Incremental and accuracy-aware {Personalized PageRank} through scheduled approximation", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "6", pages = "481--492", month = apr, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:32 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As Personalized PageRank has been widely leveraged for ranking on a graph, the efficient computation of Personalized PageRank Vector (PPV) becomes a prominent issue. In this paper, we propose FastPPV, an approximate PPV computation algorithm that is incremental and accuracy-aware. Our approach hinges on a novel paradigm of scheduled approximation: the computation is partitioned and scheduled for processing in an ``organized'' way, such that we can gradually improve our PPV estimation in an incremental manner, and quantify the accuracy of our approximation at query time. Guided by this principle, we develop an efficient hub based realization, where we adopt the metric of hub-length to partition and schedule random walk tours so that the approximation error reduces exponentially over iterations. Furthermore, as tours are segmented by hubs, the shared substructures between different tours (around the same hub) can be reused to speed up query processing both within and across iterations. Finally, we evaluate FastPPV over two real-world graphs, and show that it not only significantly outperforms two state-of-the-art baselines in both online and offline phases, but also scales well on larger graphs. 
In particular, we are able to achieve near-constant time online query processing irrespective of graph size.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2013:ESB, author = "Weiguo Zheng and Lei Zou and Yansong Feng and Lei Chen and Dongyan Zhao", title = "Efficient {SimRank}-based similarity join over large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "7", pages = "493--504", month = may, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:37 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs have been widely used to model complex data in many real-world applications. Answering vertex join queries over large graphs is meaningful and interesting, which can benefit friend recommendation in social networks and link prediction, etc. In this paper, we adopt ``SimRank'' to evaluate the similarity of two vertices in a large graph because of its generality. Note that ``SimRank'' is purely structure dependent and it does not rely on the domain knowledge. Specifically, we define a SimRank-based join (SRJ) query to find all the vertex pairs satisfying the threshold in a data graph $G$. In order to reduce the search space, we propose an estimated shortest-path distance based upper bound for SimRank scores to prune unpromising vertex pairs. In the verification, we propose a novel index, called $h$-go cover, to efficiently compute the SimRank score of a single vertex pair. Given a graph $G$, we only materialize the SimRank scores of a small proportion of vertex pairs (called $h$-go covers), based on which, the SimRank score of any vertex pair can be computed easily. In order to handle large graphs, we extend our technique to the partition-based framework. 
Thorough theoretical analysis and extensive experiments over both real and synthetic datasets confirm the efficiency and effectiveness of our solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2013:PST, author = "Guimei Liu and Andre Suchitra and Limsoon Wong", title = "A performance study of three disk-based structures for indexing and querying frequent itemsets", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "7", pages = "505--516", month = may, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:37 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Frequent itemset mining is an important problem in the data mining area. Extensive efforts have been devoted to developing efficient algorithms for mining frequent itemsets. However, not much attention is paid on managing the large collection of frequent itemsets produced by these algorithms for subsequent analysis and for user exploration. In this paper, we study three structures for indexing and querying frequent itemsets: inverted files, signature files and CFP-tree. The first two structures have been widely used for indexing general set-valued data. We make some modifications to make them more suitable for indexing frequent itemsets. The CFP-tree structure is specially designed for storing frequent itemsets. We add a pruning technique based on length-2 frequent itemsets to make it more efficient for processing superset queries. We study the performance of the three structures in supporting five types of containment queries: exact match, subset/superset search and immediate subset/superset search. Our results show that no structure can outperform other structures for all the five types of queries on all the datasets. 
CFP-tree shows better overall performance than the other two structures.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2013:TFC, author = "Pingpeng Yuan and Pu Liu and Buwen Wu and Hai Jin and Wenya Zhang and Ling Liu", title = "{TripleBit}: a fast and compact system for large scale {RDF} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "7", pages = "517--528", month = may, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:37 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The volume of RDF data continues to grow over the past decade and many known RDF datasets have billions of triples. A grand challenge of managing this huge RDF data is how to access this big RDF data efficiently. A popular approach to addressing the problem is to build a full set of permutations of $ (S, P, O) $ indexes. Although this approach has shown to accelerate joins by orders of magnitude, the large space overhead limits the scalability of this approach and makes it heavyweight. In this paper, we present TripleBit, a fast and compact system for storing and accessing RDF data. The design of TripleBit has three salient features. First, the compact design of TripleBit reduces both the size of stored RDF data and the size of its indexes. Second, TripleBit introduces two auxiliary index structures, ID-Chunk bit matrix and ID-Predicate bit matrix, to minimize the cost of index selection during query evaluation. Third, its query processor dynamically generates an optimal execution ordering for join queries, leading to fast query execution and effective reduction on the size of intermediate results. 
Our experiments show that TripleBit outperforms RDF-3X, MonetDB, BitMat on LUBM, UniProt and BTC 2012 benchmark queries and it offers orders of magnitude performance improvement for some complex join queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bajaj:2013:CSE, author = "Sumeet Bajaj and Radu Sion", title = "{CorrectDB}: {SQL} engine with practical query authentication", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "7", pages = "529--540", month = may, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:37 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Clients of outsourced databases need Query Authentication (QA) guaranteeing the integrity (correctness and completeness), and authenticity of the query results returned by potentially compromised providers. Existing results provide QA assurances for a limited class of queries by deploying several software cryptographic constructs. Here, we show that, to achieve QA, however, it is significantly cheaper and more practical to deploy server-hosted, tamper-proof co-processors, despite their higher acquisition costs. Further, this provides the ability to handle arbitrary queries. To reach this insight, we extensively survey existing QA work and identify interdependencies and efficiency relationships. We then introduce CorrectDB, a new DBMS with full QA assurances, leveraging server-hosted, tamper-proof, trusted hardware in close proximity to the outsourced data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2013:HSM, author = "Xin Liu and Kenneth Salem", title = "Hybrid storage management for database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "8", pages = "541--552", month = jun, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:42 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The use of flash-based solid state drives (SSDs) in storage systems is growing. Adding SSDs to a storage system not only raises the question of how to manage the SSDs, but also raises the question of whether current buffer pool algorithms will still work effectively. We are interested in the use of hybrid storage systems, consisting of SSDs and hard disk drives (HDDs), for database management. We present cost-aware replacement algorithms, which are aware of the difference in performance between SSDs and HDDs, for both the DBMS buffer pool and the SSDs. In hybrid storage systems, the physical access pattern to the SSDs depends on the management of the DBMS buffer pool. We studied the impact of buffer pool caching policies on SSD access patterns. Based on these studies, we designed a cost-adjusted caching policy to effectively manage the SSD. We implemented these algorithms in MySQL's InnoDB storage engine and used the TPC-C workload to demonstrate that these cost-aware algorithms outperform previous algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2013:SEO, author = "Eugene Wu and Samuel Madden", title = "{Scorpion}: explaining away outliers in aggregate queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "8", pages = "553--564", month = jun, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:42 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database users commonly explore large data sets by running aggregate queries that project the data down to a smaller number of points and dimensions, and visualizing the results. Often, such visualizations will reveal outliers that correspond to errors or surprising features of the input data set. Unfortunately, databases and visualization systems do not provide a way to work backwards from an outlier point to the common properties of the (possibly many) unaggregated input tuples that correspond to that outlier. We propose Scorpion, a system that takes a set of user-specified outlier points in an aggregate query result as input and finds predicates that explain the outliers in terms of properties of the input tuples that are used to compute the selected outlier results. Specifically, this explanation identifies predicates that, when applied to the input data, cause the outliers to disappear from the output. To find such predicates, we develop a notion of influence of a predicate on a given output, and design several algorithms that efficiently search for maximum influence predicates over the input data. We show that these algorithms can quickly find outliers in two real data sets (from a sensor deployment and a campaign finance data set), and run orders of magnitude faster than a naive search algorithm while providing comparable quality on a synthetic data set.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gupta:2013:RTQ, author = "Rajeev Gupta and Krithi Ramamritham and Mukesh Mohania", title = "Ratio threshold queries over distributed data sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "8", pages = "565--576", month = jun, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:42 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Continuous aggregation queries over dynamic data are used for real time decision making and timely business intelligence. In this paper we consider queries where a client wants to be notified if the ratio of two aggregates over distributed data crosses a specified threshold. Consider these scenarios: a mechanism designed to defend against distributed denial of service attacks may be triggered when the fraction of packets arriving to a subnet is more than 5\% of the total packets; or a distributed store chain withdraws its discount on luxury goods when sales of luxury goods constitute more than 20\% of the overall sales. The challenge in executing such ratio threshold queries (RTQs) lies in incurring the minimal amount of communication necessary for propagation of updates from data sources to the aggregator node where the client query is executed. We address this challenge by proposing schemes for converting the client ratio threshold condition into conditions on individual distributed data sources. Whenever the condition associated with a source is violated, the source pushes its data values to the aggregator, which in turn pulls data values from other sources to determine whether the client threshold condition is indeed violated. 
We present algorithms to minimize the number of source condition violations (i.e., the number of pushes) while ensuring that no violation of the client threshold condition is missed. Further, in case of a source condition violation, we propose efficient selective pulling algorithms for intelligently choosing additional sources whose data should be pulled by the aggregator. Using performance evaluation on synthetic and real traces of data updates we show that our algorithms result in up to an order of magnitude less number of messages compared to existing approaches in the literature.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deng:2013:CQR, author = "Ting Deng and Wenfei Fan", title = "On the complexity of query result diversification", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "8", pages = "577--588", month = jun, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:42 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query result diversification is a bi-criteria optimization problem for ranking query results. Given a database $D$, a query $Q$ and a positive integer $k$, it is to find a set of $k$ tuples from $ Q(D) $ such that the tuples are as relevant as possible to the query, and at the same time, as diverse as possible to each other. Subsets of $ Q(D) $ are ranked by an objective function defined in terms of relevance and diversity. Query result diversification has found a variety of applications in databases, information retrieval and operations research. This paper studies the complexity of result diversification for relational queries. 
We identify three problems in connection with query result diversification, to determine whether there exists a set of $k$ tuples that is ranked above a bound with respect to relevance and diversity, to assess the rank of a given $k$-element set, and to count how many $k$-element sets are ranked above a given bound. We study these problems for a variety of query languages and for three objective functions. We establish the upper and lower bounds of these problems, all matching, for both combined complexity and data complexity. We also investigate several special settings of these problems, identifying tractable cases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dutta:2013:SQF, author = "Sourav Dutta and Ankur Narang and Suman K. Bera", title = "Streaming quotient filter: a near optimal approximate duplicate detection approach for data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "8", pages = "589--600", month = jun, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:42 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The unparalleled growth and popularity of the Internet coupled with the advent of diverse modern applications such as search engines, on-line transactions, climate warning systems, etc., has catered to an unprecedented expanse in the volume of data stored world-wide. Efficient storage, management, and processing of such massively exponential amount of data has emerged as a central theme of research in this direction. Detection and removal of redundancies and duplicates in real-time from such multi-trillion record-set to bolster resource and compute efficiency constitutes a challenging area of study. 
The infeasibility of storing the entire data from potentially unbounded data streams, with the need for precise elimination of duplicates calls for intelligent approximate duplicate detection algorithms. The literature hosts numerous works based on the well-known probabilistic bitmap structure, Bloom Filter and its variants. In this paper we propose a novel data structure, Streaming Quotient Filter, (SQF) for efficient detection and removal of duplicates in data streams. SQF intelligently stores the signatures of elements arriving on a data stream, and along with an eviction policy provides near zero false positive and false negative rates. We show that the near optimal performance of SQF is achieved with a very low memory requirement, making it ideal for real-time memory-efficient de-duplication applications having an extremely low false positive and false negative tolerance rates. We present detailed theoretical analysis of the working of SQF, providing a guarantee on its performance. Empirically, we compare SQF to alternate methods and show that the proposed method is superior in terms of memory and accuracy compared to the existing solutions. We also discuss Dynamic SQF for evolving streams and the parallel implementation of SQF.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Korn:2013:RSP, author = "Flip Korn and Barna Saha and Divesh Srivastava and Shanshan Ying", title = "On repairing structural problems in semi-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "601--612", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Semi-structured data such as XML are popular for data interchange and storage. 
However, many XML documents have improper nesting where open- and close-tags are unmatched. Since some semi-structured data (e.g., Latex) have a flexible grammar and since many XML documents lack an accompanying DTD or XSD, we focus on computing a syntactic repair via the edit distance. To solve this problem, we propose a dynamic programming algorithm which takes cubic time. While this algorithm is not scalable, well-formed substrings of the data can be pruned to enable faster computation. Unfortunately, there are still cases where the dynamic program could be very expensive; hence, we give branch-and-bound algorithms based on various combinations of two heuristics, called MinCost and MaxBenefit, that trade off between accuracy and efficiency. Finally, we experimentally demonstrate the performance of these algorithms on real data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Manshadi:2013:DAL, author = "Faraz Makari Manshadi and Baruch Awerbuch and Rainer Gemulla and Rohit Khandekar and Juli{\'a}n Mestre and Mauro Sozio", title = "A distributed algorithm for large-scale generalized matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "613--624", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Generalized matching problems arise in a number of applications, including computational advertising, recommender systems, and trade markets.
Consider, for example, the problem of recommending multimedia items (e.g., DVDs) to users such that (1) users are recommended items that they are likely to be interested in, (2) every user gets neither too few nor too many recommendations, and (3) only items available in stock are recommended to users. State-of-the-art matching algorithms fail at coping with large real-world instances, which may involve millions of users and items. We propose the first distributed algorithm for computing near-optimal solutions to large-scale generalized matching problems like the one above. Our algorithm is designed to run on a small cluster of commodity nodes (or in a MapReduce environment), has strong approximation guarantees, and requires only a poly-logarithmic number of passes over the input. In particular, we propose a novel distributed algorithm to approximately solve mixed packing-covering linear programs, which include but are not limited to generalized matching problems. Experiments on real-world and synthetic data suggest that a practical variant of our algorithm scales to very large problem sizes and can be orders of magnitude faster than alternative approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Geerts:2013:LDC, author = "Floris Geerts and Giansalvatore Mecca and Paolo Papotti and Donatello Santoro", title = "The {LLUNATIC} data-cleaning framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "625--636", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-cleaning (or data-repairing) is considered a crucial problem in many database-related tasks. 
It consists in making a database consistent with respect to a set of given constraints. In recent years, repairing methods have been proposed for several classes of constraints. However, these methods rely on ad hoc decisions and tend to hard-code the strategy to repair conflicting values. As a consequence, there is currently no general algorithm to solve database repairing problems that involve different kinds of constraints and different strategies to select preferred values. In this paper we develop a uniform framework to solve this problem. We propose a new semantics for repairs, and a chase-based algorithm to compute minimal solutions. We implemented the framework in a DBMS-based prototype, and we report experimental results that confirm its good scalability and superior quality in computing repairs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Psaroudakis:2013:SDW, author = "Iraklis Psaroudakis and Manos Athanassoulis and Anastasia Ailamaki", title = "Sharing data and work across concurrent analytical queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "637--648", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's data deluge enables organizations to collect massive data, and analyze it with an ever-increasing number of concurrent queries. Traditional data warehouses (DW) face a challenging problem in executing this task, due to their query-centric model: each query is optimized and executed independently. This model results in high contention for resources. Thus, modern DW depart from the query-centric model to execution models involving sharing of common data and work. 
Our goal is to show when and how a DW should employ sharing. We evaluate experimentally two sharing methodologies, based on their original prototype systems, that exploit work sharing opportunities among concurrent queries at run-time: Simultaneous Pipelining (SP), which shares intermediate results of common sub-plans, and Global Query Plans (GQP), which build and evaluate a single query plan with shared operators. First, after a short review of sharing methodologies, we show that SP and GQP are orthogonal techniques. SP can be applied to shared operators of a GQP, reducing response times by 20\%--48\% in workloads with numerous common sub-plans. Second, we corroborate previous results on the negative impact of SP on performance for cases of low concurrency. We attribute this behavior to a bottleneck caused by the push-based communication model of SP. We show that pull-based communication for SP eliminates the overhead of sharing altogether for low concurrency, and scales better on multi-core machines than push-based SP, further reducing response times by 82\%--86\% for high concurrency. Third, we perform an experimental analysis of SP, GQP and their combination, and show when each one is beneficial. We identify a trade-off between low and high concurrency. In the former case, traditional query-centric operators with SP perform better, while in the latter case, GQP with shared operators enhanced by SP give the best results.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shang:2013:SOA, author = "Haichuan Shang and Masaru Kitsuregawa", title = "Skyline operator on anti-correlated distributions", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "649--660", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding the skyline in a multi-dimensional space is relevant to a wide range of applications. The skyline operator over a set of $d$-dimensional points selects the points that are not dominated by any other point on all dimensions. Therefore, it provides a minimal set of candidates for the users to make their personal trade-off among all optimal solutions. The existing algorithms establish both the worst case complexity by discarding distributions and the average case complexity by assuming dimensional independence. However, the data in the real world is more likely to be anti-correlated. The cardinality and complexity analysis on dimensionally independent data is meaningless when dealing with anti-correlated data. Furthermore, the performance of the existing algorithms becomes impractical on anti-correlated data. In this paper, we establish a cardinality model for anti-correlated distributions. We propose an accurate polynomial estimation for the expected value of the skyline cardinality. Because the high skyline cardinality downgrades the performance of most existing algorithms on anti-correlated data, we further develop a determination and elimination framework which extends the well-adopted elimination strategy. It achieves remarkable effectiveness and efficiency. 
The comprehensive experiments on both real datasets and benchmark synthetic datasets demonstrate that our approach significantly outperforms the state-of-the-art algorithms under a wide range of settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mahmoud:2013:LLM, author = "Hatem Mahmoud and Faisal Nawab and Alexander Pucher and Divyakant Agrawal and Amr {El Abbadi}", title = "Low-latency multi-datacenter databases using replicated commit", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "661--672", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Web service providers have been using NoSQL datastores to provide scalability and availability for globally distributed data at the cost of sacrificing transactional guarantees. Recently, major web service providers like Google have moved towards building storage systems that provide ACID transactional guarantees for globally distributed data. For example, the newly published system, Spanner, uses Two-Phase Commit and Two-Phase Locking to provide atomicity and isolation for globally distributed data, running on top of Paxos to provide fault-tolerant log replication. We show in this paper that it is possible to provide the same ACID transactional guarantees for multi-datacenter databases with fewer cross-datacenter communication trips, compared to replicated logging. Instead of replicating the transactional log, we replicate the commit operation itself, by running Two-Phase Commit multiple times in different datacenters and using Paxos to reach consensus among datacenters as to whether the transaction should commit. 
Doing so not only replaces several inter-datacenter communication trips with intra-datacenter communication trips, but also allows us to integrate atomic commitment and isolation protocols with consistent replication protocols to further reduce the number of cross-datacenter communication trips needed for consistent replication; for example, by eliminating the need for an election phase in Paxos. We analyze our approach in terms of communication trips to compare it against the log replication approach, then we conduct an extensive experimental study to compare the performance and scalability of both approaches under various multi-datacenter setups.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chi:2013:DBQ, author = "Yun Chi and Hakan Hac{\'\i}g{\"u}m{\"u}s and Wang-Pin Hsiung and Jeffrey F. Naughton", title = "Distribution-based query scheduling", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "673--684", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query scheduling, a fundamental problem in database management systems, has recently received a renewed attention, perhaps in part due to the rise of the ``database as a service'' (DaaS) model for database deployment. While there has been a great deal of work investigating different scheduling algorithms, there has been comparatively little work investigating what the scheduling algorithms can or should know about the queries to be scheduled. In this work, we investigate the efficacy of using histograms describing the distribution of likely query execution times as input to the query scheduler. 
We propose a novel distribution-based scheduling algorithm, Shepherd, and show that Shepherd substantially outperforms state-of-the-art point-based methods through extensive experimentation with both synthetic and TPC workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2013:MQT, author = "Wenfei Fan and Floris Geerts and Frank Neven", title = "Making queries tractable on big data with preprocessing: through the eyes of complexity theory", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "685--696", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A query class is traditionally considered tractable if there exists a polynomial-time (PTIME) algorithm to answer its queries. When it comes to big data, however, PTIME algorithms often become infeasible in practice. A traditional and effective approach to coping with this is to preprocess data off-line, so that queries in the class can be subsequently evaluated on the data efficiently. This paper aims to provide a formal foundation for this approach in terms of computational complexity. (1) We propose a set of $ \Pi $-tractable queries, denoted by $ \Pi T Q^0 $, to characterize classes of queries that can be answered in parallel poly-logarithmic time (NC) after PTIME preprocessing. (2) We show that several natural query classes are $ \Pi $-tractable and are feasible on big data. (3) We also study a set $ \Pi T Q $ of query classes that can be effectively converted to $ \Pi $-tractable queries by refactorizing its data and queries for preprocessing. We introduce a form of NC reductions to characterize such conversions. 
(4) We show that a natural query class is complete for $ \Pi T Q $. (5) We also show that $ \Pi T Q^0 \subset P $ unless $ P = {\rm NC} $, i.e., the set $ \Pi T Q^0 $ of all $ \Pi $-tractable queries is properly contained in the set $P$ of all PTIME queries. Nonetheless, $ \Pi T Q = P $, i.e., all PTIME query classes can be made $ \Pi $-tractable via proper refactorizations. This work is a step towards understanding the tractability of queries in the context of big data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaplan:2013:APQ, author = "Haim Kaplan and Ilia Lotosh and Tova Milo and Slava Novgorodov", title = "Answering planning queries with the crowd", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "697--708", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent research has shown that crowd sourcing can be used effectively to solve problems that are difficult for computers, e.g., optical character recognition and identification of the structural configuration of natural proteins. In this paper we propose to use the power of the crowd to address yet another difficult problem that frequently occurs in a daily life --- answering planning queries whose output is a sequence of objects/actions, when the goal, i.e., the notion of ``best output'', is hard to formalize. For example, planning the sequence of places/attractions to visit in the course of a vacation, where the goal is to enjoy the resulting vacation the most, or planning the sequence of courses to take in an academic schedule planning, where the goal is to obtain solid knowledge of a given subject domain.
Such goals may be easily understandable by humans, but hard or even impossible to formalize for a computer. We present a novel algorithm for efficiently harnessing the crowd to assist in answering such planning queries. The algorithm builds the desired plans incrementally, choosing at each step the 'best' questions so that the overall number of questions that need to be asked is minimized. We prove the algorithm to be optimal within its class and demonstrate experimentally its effectiveness and efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Heimel:2013:HOP, author = "Max Heimel and Michael Saecker and Holger Pirk and Stefan Manegold and Volker Markl", title = "Hardware-oblivious parallelism for in-memory column-stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "709--720", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The multi-core architectures of today's computer systems make parallelism a necessity for performance critical applications. Writing such applications in a generic, hardware-oblivious manner is a challenging problem: Current database systems thus rely on labor-intensive and error-prone manual tuning to exploit the full potential of modern parallel hardware architectures like multi-core CPUs and graphics cards. We propose an alternative design for a parallel database engine, based on a single set of hardware-oblivious operators, which are compiled down to the actual hardware at runtime. This design reduces the development overhead for parallel database engines, while achieving competitive performance to hand-tuned systems. 
We provide a proof-of-concept for this design by integrating operators written using the parallel programming framework OpenCL into the open-source database MonetDB. Following this approach, we achieve efficient, yet highly portable parallel code without the need for optimization by hand. We evaluated our implementation against MonetDB using TPC-H derived queries and observed a performance that rivals that of MonetDB's query execution on the CPU and surpasses it on the GPU. In addition, we show that the same set of operators runs nearly unchanged on a GPU, demonstrating the feasibility of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Thonangi:2013:PDR, author = "Risi Thonangi and Jun Yang", title = "Permuting data on random-access block storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "721--732", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Permutation is a fundamental operator for array data, with applications in, for example, changing matrix layouts and reorganizing data cubes. We consider the problem of permuting large quantities of data stored on secondary storage that supports fast random block accesses, such as solid state drives and distributed key--value stores. Faster random accesses open up interesting new opportunities for permutation. While external merge sort has often been used for permutation, it is an overkill that fails to exploit the property of permutation fully and carries unnecessary overhead in storing and comparing keys. We propose faster algorithms with lower memory requirements for a large, useful class of permutations. 
We also tackle practical challenges that traditional permutation algorithms have not dealt with, such as exploiting random block accesses more aggressively, considering the cost asymmetry between reads and writes, and handling arbitrary data dimension sizes (as opposed to perfect powers often assumed by previous work). As a result, our algorithms are faster and more broadly applicable.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Stoica:2013:IFW, author = "Radu Stoica and Anastasia Ailamaki", title = "Improving flash write performance by using update frequency", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "733--744", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Solid-state drives (SSDs) are quickly becoming the default storage medium as the cost of NAND flash memory continues to drop. However, flash memory introduces new challenges, as data cannot be efficiently updated in-place. To overcome the technology's limitations, SSDs incorporate a software Flash Translation Layer (FTL) that implements out-of-place updates, typically by storing data in a log-structured fashion. Despite a large number of existing FTL algorithms, SSD performance, predictability, and lifetime remain an issue, especially for the write-intensive workloads specific to database applications. In this paper, we show how to design FTLs that are more efficient by using the I/O write skew to guide data placement on flash memory.
We model the relationship between data placement and write performance for basic I/O write patterns and detail the most important concepts of writing to flash memory: (i) the trade-off between the extra capacity available and write overhead, (ii) the benefit of adapting data placement to write skew, (iii) the impact of the cleaning policy, and (iv) how to estimate the best achievable write performance for a given I/O workload. Based on the findings of the theoretical model, we propose a new principled data placement algorithm that can be incorporated into existing FTLs. We show the benefits of our data placement algorithm when running micro-benchmarks and real database I/O traces: our data placement algorithm reduces write overhead by 20\%--75\% when compared to state-of-art techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2013:EID, author = "Lu Li and Chee-Yong Chan", title = "Efficient indexing for diverse query results", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "745--756", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper examines the problem of computing diverse query results which is useful for browsing search results in online shopping applications. The search results are diversified wrt a sequence of output attributes (termed $d$-order) where an attribute that appears earlier in the $d$-order has higher priority for diversification. We present a new indexing technique, $D$-Index, to efficiently compute diverse query results for queries with static or dynamic $d$-orders.
Our performance evaluation demonstrates that our $D$-Index outperforms the state-of-the-art techniques developed for queries with static or dynamic $d$-orders.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2013:RUS, author = "Chen Jason Zhang and Lei Chen and H. V. Jagadish and Chen Caleb Cao", title = "Reducing uncertainty of schema matching via crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "757--768", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Schema matching is a central challenge for data integration systems. Automated tools are often uncertain about schema matchings they suggest, and this uncertainty is inherent since it arises from the inability of the schema to fully capture the semantics of the represented data. Human common sense can often help. Inspired by the popularity and the success of easily accessible crowdsourcing platforms, we explore the use of crowdsourcing to reduce the uncertainty of schema matching. Since it is typical to ask simple questions on crowdsourcing platforms, we assume that each question, namely Correspondence Correctness Question (CCQ), is to ask the crowd to decide whether a given correspondence should exist in the correct matching. We propose frameworks and efficient algorithms to dynamically manage the CCQs, in order to maximize the uncertainty reduction within a limited budget of questions. We develop two novel approaches, namely ``Single CCQ'' and ``Multiple CCQ'', which adaptively select, publish and manage the questions. We verified the value of our solutions with simulation and real implementation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2013:TCI, author = "Bin Yang and Chenjuan Guo and Christian S. Jensen", title = "Travel cost inference from sparse, spatio-temporally correlated time series using {Markov} models", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "9", pages = "769--780", month = jul, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:46 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The monitoring of a system can yield a set of measurements that can be modeled as a collection of time series. These time series are often sparse, due to missing measurements, and spatiotemporally correlated, meaning that spatially close time series exhibit temporal correlation. The analysis of such time series offers insight into the underlying system and enables prediction of system behavior. While the techniques presented in the paper apply more generally, we consider the case of transportation systems and aim to predict travel cost from GPS tracking data from probe vehicles. Specifically, each road segment has an associated travel-cost time series, which is derived from GPS data. We use spatio-temporal hidden Markov models (STHMM) to model correlations among different traffic time series. We provide algorithms that are able to learn the parameters of an STHMM while contending with the sparsity, spatio-temporal correlation, and heterogeneity of the time series. Using the resulting STHMM, near future travel costs in the transportation network, e.g., travel time or greenhouse gas emissions, can be inferred, enabling a variety of routing services, e.g., eco-routing. 
Empirical studies with a substantial GPS data set offer insight into the design properties of the proposed framework and algorithms, demonstrating the effectiveness and efficiency of travel cost inferencing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2013:QOC, author = "Hyunjung Park and Jennifer Widom", title = "Query optimization over crowdsourced data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "781--792", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deco is a comprehensive system for answering declarative queries posed over stored relational data together with data obtained on-demand from the crowd. In this paper we describe Deco's cost-based query optimizer, building on Deco's data model, query language, and query execution engine presented earlier. Deco's objective in query optimization is to find the best query plan to answer a query, in terms of estimated monetary cost. Deco's query semantics and plan execution strategies require several fundamental changes to traditional query optimization. Novel techniques incorporated into Deco's query optimizer include a cost model distinguishing between ``free'' existing data versus paid new data, a cardinality estimation algorithm coping with changes to the database state during query execution, and a plan enumeration algorithm maximizing reuse of common subplans in a setting that makes reuse challenging. We experimentally evaluate Deco's query optimizer, focusing on the accuracy of cost estimation and the efficiency of plan enumeration.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2013:DAD, author = "Yang Wang and Peng Wang and Jian Pei and Wei Wang and Sheng Huang", title = "A data-adaptive and dynamic segmentation index for whole matching on time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "793--804", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity search on time series is an essential operation in many applications. In the state-of-the-art methods, such as the R-tree based methods, SAX and iSAX, time series are by default divided into equi-length segments globally, that is, all time series are segmented in the same way. Those methods then focus on how to approximate or symbolize the segments and construct indexes. In this paper, we make an important observation: global segmentation of all time series may incur unnecessary cost in space and time for indexing time series. We develop DSTree, a data adaptive and dynamic segmentation index on time series. In addition to savings in space and time, our new index can provide tight upper and lower bounds on distances between time series. An extensive empirical study shows that our new index DSTree supports time series similarity search effectively and efficiently.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bronzi:2013:EIP, author = "Mirko Bronzi and Valter Crescenzi and Paolo Merialdo and Paolo Papotti", title = "Extraction and integration of partially overlapping web sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "805--816", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present an unsupervised approach for harvesting the data exposed by a set of structured and partially overlapping data-intensive web sources. Our proposal comes within a formal framework tackling two problems: the data extraction problem, to generate extraction rules based on the input websites, and the data integration problem, to integrate the extracted data in a unified schema. We introduce an original algorithm, WEIR, to solve the stated problems and formally prove its correctness. WEIR leverages the overlapping data among sources to make better decisions both in the data extraction (by pruning rules that do not lead to redundant information) and in the data integration (by reflecting local properties of a source over the mediated schema). Along the way, we characterize the amount of redundancy needed by our algorithm to produce a solution, and present experimental results to show the benefits of our approach with respect to existing solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2013:YYP, author = "Yuan Yuan and Rubao Lee and Xiaodong Zhang", title = "The {Yin} and {Yang} of processing data warehousing queries on {GPU} devices", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "817--828", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database community has made significant research efforts to optimize query processing on GPUs in the past few years. However, we can hardly find that GPUs have been truly adopted in major warehousing production systems. Preparing to merge GPUs to the warehousing systems, we have identified and addressed several critical issues in a three-dimensional study of warehousing queries on GPUs by varying query characteristics, software techniques, and GPU hardware configurations. We also propose an analytical model to understand and predict the query performance on GPUs. Based on our study, we present our performance insights for warehousing query execution on GPUs. The objective of our work is to provide a comprehensive guidance for GPU architects, software system designers, and database practitioners to narrow the speed gap between the GPU kernel execution (the fast mode) and data transfer to prepare GPU execution (the slow mode) for high performance in processing data warehousing queries. The GPU query engine developed in this work is open source to the public.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2013:MIG, author = "Dayu Yuan and Prasenjit Mitra and C. 
Lee Giles", title = "Mining and indexing graphs for supergraph search", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "829--840", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study supergraph search (SPS), that is, given a query graph $q$ and a graph database $G$ that contains a collection of graphs, return graphs that have $q$ as a supergraph from $G$. SPS has broad applications in bioinformatics, cheminformatics and other scientific and commercial fields. Determining whether a graph is a subgraph (or supergraph) of another is an NP-complete problem. Hence, it is intractable to compute SPS for large graph databases. Two separate indexing methods, a ``filter + verify''-based method and a ``prefix-sharing''-based method, have been studied to efficiently compute SPS. To implement the above two methods, subgraph patterns are mined from the graph database to build an index. Those subgraphs are mined to optimize either the filtering gain or the prefix-sharing gain. However, no single subgraph-mining algorithm considers both gains. This work is the first one to mine subgraphs to optimize both the filtering gain and the prefix-sharing gain while processing SPS queries. First, we show that the subgraph-mining problem is NP-hard. Then, we propose two polynomial-time algorithms to solve the problem with an approximation ratio of $ 1 - 1 / e $ and $ 1 / 4 $ respectively. In addition, we construct a lattice-like index, LW-index, to organize the selected subgraph patterns for fast index-lookup. Our experiments show that our approach improves the query processing time for SPS queries by a factor of 3 to 10.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2013:ERM, author = "Jianmin Wang and Shaoxu Song and Xiaochen Zhu and Xuemin Lin", title = "Efficient recovery of missing events", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "841--852", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For various entering and transmission issues raised by human or system, missing events often occur in event data, which record execution logs of business processes. Without recovering these missing events, applications such as provenance analysis or complex event processing built upon event data are not reliable. Following the minimum change discipline in improving data quality, it is also rational to find a recovery that minimally differs from the original data. Existing recovery approaches fall short of efficiency owing to enumerating and searching over all the possible sequences of events. In this paper, we study the efficient techniques for recovering missing events. According to our theoretical results, the recovery problem is proved to be NP-hard. Nevertheless, we are able to concisely represent the space of event sequences in a branching framework. Advanced indexing and pruning techniques are developed to further improve the recovery efficiency. Our proposed efficient techniques make it possible to find top-$k$ recoveries. The experimental results demonstrate that our minimum recovery approach achieves high accuracy, and significantly outperforms the state-of-the-art technique for up to 5 orders of magnitudes improvement in time performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2013:HAA, author = "Kai Ren and YongChul Kwon and Magdalena Balazinska and Bill Howe", title = "{Hadoop}'s adolescence: an analysis of {Hadoop} usage in scientific workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "853--864", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We analyze Hadoop workloads from three different research clusters from a user-centric perspective. The goal is to better understand data scientists' use of the system and how well the use of the system matches its design. Our analysis suggests that Hadoop usage is still in its adolescence. We see underuse of Hadoop features, extensions, and tools. We see significant diversity in resource usage and application styles, including some interactive and iterative workloads, motivating new tools in the ecosystem. We also observe significant opportunities for optimizations of these workloads. We find that job customization and configuration are used in a narrow scope, suggesting the future pursuit of automatic tuning systems. Overall, we present the first user-centered measurement study of Hadoop and find significant opportunities for improving its efficient use for data scientists.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mansour:2013:RSE, author = "Essam Mansour and Ahmed El-Roby and Panos Kalnis and Aron Ahmadia and Ashraf Aboulnaga", title = "{RACE}: a scalable and elastic parallel system for discovering repeats in very long sequences", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "865--876", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A wide range of applications, including bioinformatics, time series, and log analysis, depend on the identification of repetitions in very long sequences. The problem of finding maximal pairs subsumes most important types of repetition-finding tasks. Existing solutions require both the input sequence and its index (typically an order of magnitude larger than the input) to fit in memory. Moreover, they are serial algorithms with long execution time. Therefore, they are limited to small datasets, despite the fact that modern applications demand orders of magnitude longer sequences. In this paper we present RACE, a parallel system for finding maximal pairs in very long sequences. RACE supports parallel execution on stand-alone multicore systems, in addition to scaling to thousands of nodes on clusters or supercomputers. RACE does not require the input or the index to fit in memory; therefore, it supports very long sequences with limited memory. Moreover, it uses a novel array representation that allows for cache-efficient implementation. RACE is particularly suitable for the cloud (e.g., Amazon EC2) because, based on availability, it can scale elastically to more or fewer machines during its execution. 
Since scaling out introduces overheads, mainly due to load imbalance, we propose a cost model to estimate the expected speedup, based on statistics gathered through sampling. The model allows the user to select the appropriate combination of cloud resources based on the provider's prices and the required deadline. We conducted extensive experimental evaluation with large real datasets and large computing infrastructures. In contrast to existing methods, RACE can handle the entire human genome on a typical desktop computer with 16GB RAM. Moreover, for a problem that takes 10 hours of serial execution, RACE finishes in 28 seconds using 2,048 nodes on an IBM BlueGene/P supercomputer.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Levandoski:2013:LCS, author = "Justin Levandoski and David Lomet and Sudipta Sengupta", title = "{LLAMA}: a cache\slash storage subsystem for modern hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "877--888", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "LLAMA is a subsystem designed for new hardware environments that supports an API for page-oriented access methods, providing both cache and storage management. Caching (CL) and storage (SL) layers use a common mapping table that separates a page's logical and physical location. CL supports data updates and management updates (e.g., for index re-organization) via latch-free compare-and-swap atomic state changes on its mapping table. SL uses the same mapping table to cope with page location changes produced by log structuring on every page flush. To demonstrate LLAMA's suitability, we tailored our latch-free Bw-tree implementation to use LLAMA. 
The Bw-tree is a B-tree style index. Layered on LLAMA, it has higher performance and scalability using real workloads compared with BerkeleyDB's B-tree, which is known for good performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2013:RCP, author = "Jiong He and Mian Lu and Bingsheng He", title = "Revisiting co-processing for hash joins on the coupled {CPU--GPU} architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "889--900", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query co-processing on graphics processors (GPUs) has become an effective means to improve the performance of main memory databases. However, the relatively low bandwidth and high latency of the PCI-e bus are usually bottleneck issues for co-processing. Recently, coupled CPU-GPU architectures have received a lot of attention, e.g. AMD APUs with the CPU and the GPU integrated into a single chip. That opens up new opportunities for optimizing query co-processing. In this paper, we experimentally revisit hash joins, one of the most important join algorithms for main memory databases, on a coupled CPU-GPU architecture. Particularly, we study the fine-grained co-processing mechanisms on hash joins with and without partitioning. The co-processing outlines an interesting design space. We extend existing cost models to automatically guide decisions on the design space. 
Our experimental results on a recent AMD APU show that (1) the coupled architecture enables fine-grained co-processing and cache reuses, which are inefficient on discrete CPU-GPU architectures; (2) the cost model can automatically guide the design and tuning knobs in the design space; (3) fine-grained co-processing achieves up to 53\%, 35\% and 28\% performance improvement over CPU-only, GPU-only and conventional CPU-GPU co-processing, respectively. We believe that the insights and implications from this study are initial yet important for further research on query co-processing on coupled CPU-GPU architectures.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiao:2013:TKN, author = "Miao Qiao and Lu Qin and Hong Cheng and Jeffrey Xu Yu and Wentao Tian", title = "Top-$k$ nearest keyword search on large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "901--912", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "It is quite common for networks emerging nowadays to have labels or textual contents on the nodes. On such networks, we study the problem of top-$k$ nearest keyword ($k$-NK) search. In a network $G$ modeled as an undirected graph, each node is attached with zero or more keywords, and each edge is assigned with a weight measuring its length. Given a query node $q$ in $G$ and a keyword $ \lambda $, a $k$-NK query seeks $k$ nodes which contain $ \lambda $ and are nearest to $q$. $k$-NK is not only useful as a stand-alone query but also as a building block for tackling complex graph pattern matching problems. The key to an accurate $k$-NK result is a precise shortest distance estimation in a graph. 
Based on the latest distance oracle technique, we build a shortest path tree for a distance oracle and use the tree distance as a more accurate estimation. With such representation, the original $k$-NK query on a graph can be reduced to answering the query on a set of trees and then assembling the results obtained from the trees. We propose two efficient algorithms to report the exact $k$-NK result on a tree. One is query time optimized for a scenario when a small number of result nodes are of interest to users. The other handles $k$-NK queries for an arbitrarily large $k$ efficiently. In obtaining a $k$-NK result on a graph from that on trees, a global storage technique is proposed to further reduce the index size and the query time. Extensive experimental results conform with our theoretical findings, and demonstrate the effectiveness and efficiency of our $k$-NK algorithms on large real graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Armenatzoglou:2013:GFG, author = "Nikos Armenatzoglou and Stavros Papadopoulos and Dimitris Papadias", title = "A general framework for geo-social query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "913--924", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The proliferation of GPS-enabled mobile devices and the popularity of social networking have recently led to the rapid growth of Geo-Social Networks (GeoSNs). GeoSNs have created a fertile ground for novel location-based social interactions and advertising. These can be facilitated by GeoSN queries, which extract useful information combining both the social relationships and the current location of the users. 
This paper constitutes the first systematic work on GeoSN query processing. We propose a general framework that offers flexible data management and algorithmic design. Our architecture segregates the social, geographical and query processing modules. Each GeoSN query is processed via a transparent combination of primitive queries issued to the social and geographical modules. We demonstrate the power of our framework by introducing several ``basic'' and ``advanced'' query types, and devising various solutions for each type. Finally, we perform an exhaustive experimental evaluation with real and synthetic datasets, based on realistic implementations with both commercial software (such as MongoDB) and state-of-the-art research methods. Our results confirm the viability of our framework in typical large-scale GeoSNs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2013:TPQ, author = "Wentao Wu and Yun Chi and Hakan Hac{\i}g{\"u}m{\"u}{\c{s}} and Jeffrey F. Naughton", title = "Towards predicting query execution time for concurrent and dynamic database workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "925--936", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Predicting query execution time is crucial for many database management tasks including admission control, query scheduling, and progress monitoring. While a number of recent papers have explored this problem, the bulk of the existing work either considers prediction for a single query, or prediction for a static workload of concurrent queries, where by ``static'' we mean that the queries to be run are fixed and known. 
In this paper, we consider the more general problem of dynamic concurrent workloads. Unlike most previous work on query execution time prediction, our proposed framework is based on analytic modeling rather than machine learning. We first use the optimizer's cost model to estimate the I/O and CPU requirements for each pipeline of each query in isolation, and then use a combination queueing model and buffer pool model that merges the I/O and CPU requests from concurrent queries to predict running times. We compare the proposed approach with a machine-learning based approach that is a variant of previous work. Our experiments show that our analytic-model based approach can lead to competitive and often better prediction accuracy than its machine-learning based counterpart.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Garofalakis:2013:SBG, author = "Minos Garofalakis and Daniel Keren and Vasilis Samoladas", title = "Sketch-based geometric monitoring of distributed stream queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "937--948", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Emerging large-scale monitoring applications rely on continuous tracking of complex data-analysis queries over collections of massive, physically-distributed data streams. Thus, in addition to the space- and time-efficiency requirements of conventional stream processing (at each remote monitor site), effective solutions also need to guarantee communication efficiency (over the underlying communication network). 
The complexity of the monitored query adds to the difficulty of the problem --- this is especially true for nonlinear queries (e.g., joins), where no obvious solutions exist for distributing the monitor condition across sites. The recently proposed geometric method offers a generic methodology for splitting an arbitrary (non-linear) global threshold-monitoring task into a collection of local site constraints; still, the approach relies on maintaining the complete stream(s) at each site, thus raising serious efficiency concerns for massive data streams. In this paper, we propose novel algorithms for efficiently tracking a broad class of complex aggregate queries in such distributed-streams settings. Our tracking schemes rely on a novel combination of the geometric method with compact sketch summaries of local data streams, and maintain approximate answers with provable error guarantees, while optimizing space and processing costs at each remote site and communication cost across the network. One of our key technical insights for the effective use of the geometric method lies in exploiting a much lower-dimensional space for monitoring the sketch-based estimation query. Due to the complex, highly nonlinear nature of these estimates, efficiently monitoring the local geometric constraints poses challenging algorithmic issues for which we propose novel solutions. Experimental results on real-life data streams verify the effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Long:2013:DPT, author = "Cheng Long and Raymond Chi-Wing Wong and H. V. 
Jagadish", title = "Direction-preserving trajectory simplification", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "10", pages = "949--960", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:50 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trajectories of moving objects are collected in many applications. Raw trajectory data is typically very large, and has to be simplified before use. In this paper, we introduce the notion of direction-preserving trajectory simplification, and show both analytically and empirically that it can support a broader range of applications than traditional position-preserving trajectory simplification. We present a polynomial-time algorithm for optimal direction-preserving simplification, and another approximate algorithm with a quality guarantee. Extensive experimental evaluation with real trajectory data shows the benefit of the new techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bruno:2013:CCS, author = "Nicolas Bruno and Sapna Jain and Jingren Zhou", title = "Continuous cloud-scale query optimization and processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "961--972", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive data analysis in cloud-scale data centers plays a crucial role in making critical business decisions. High-level scripting languages free developers from understanding various system trade-offs, but introduce new challenges for query optimization. 
One key optimization challenge is missing accurate data statistics, typically due to massive data volumes and their distributed nature, complex computation logic, and frequent usage of user-defined functions. In this paper we propose novel techniques to adapt query processing in the Scope system, the cloud-scale computation environment in Microsoft Online Services. We continuously monitor query execution, collect actual runtime statistics, and adapt parallel execution plans as the query executes. We discuss similarities and differences between our approach and alternatives proposed in the context of traditional centralized systems. Experiments on large-scale Scope production clusters show that the proposed techniques systematically solve the challenge of missing/inaccurate data statistics, detect and resolve partition skew and plan structure, and improve query latency by a few folds for real workloads. Although we focus on optimizing high-level languages, the same ideas are also applicable for MapReduce systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cherniak:2013:OSB, author = "Andrii Cherniak and Huma Zaidi and Vladimir Zadorozhny", title = "Optimization strategies for {A\slash B} testing on {HADOOP}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "973--984", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we present a set of techniques that considerably improve the performance of executing concurrent MapReduce jobs. Our proposed solution relies on proper resource allocation for concurrent Hive jobs based on data dependency, inter-query optimization and modeling of Hadoop cluster load. 
To the best of our knowledge, this is the first work towards Hive/MapReduce job optimization which takes Hadoop cluster load into consideration. We perform an experimental study that demonstrates 233\% reduction in execution time for concurrent vs sequential execution schema. We report up to 40\% extra reduction in execution time for concurrent job execution after resource usage optimization. The results reported in this paper were obtained in a pilot project to assess the feasibility of migrating A/B testing from Teradata + SAS analytics infrastructure to Hadoop. This work was performed on eBay production Hadoop cluster.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elmeleegy:2013:POS, author = "Khaled Elmeleegy", title = "{Piranha}: optimizing short jobs in {Hadoop}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "985--996", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cluster computing has emerged as a key parallel processing platform for large scale data. All major internet companies use it as their major central processing platform. One of cluster computing's most popular examples is MapReduce and its open source implementation Hadoop. These systems were originally designed for batch and massive-scale computations. Interestingly, over time their production workloads have evolved into a mix of a small fraction of large and long-running jobs and a much bigger fraction of short jobs. This came about because these systems end up being used as data warehouses, which store most of the data sets and attract ad hoc, short, data-mining queries. 
Moreover, the availability of higher level query languages that operate on top of these cluster systems proliferated these ad hoc queries. Since existing systems were not designed for short, latency-sensitive jobs, short interactive jobs suffer from poor response times. In this paper, we present Piranha --- a system for optimizing short jobs on Hadoop without affecting the larger jobs. It runs on existing unmodified Hadoop clusters facilitating its adoption. Piranha exploits characteristics of short jobs learned from production workloads at Yahoo! clusters to reduce the latency of such jobs. To demonstrate Piranha's effectiveness, we evaluated its performance using three realistic short queries. Piranha was able to reduce the queries' response times by up to 71\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sadoghi:2013:MUD, author = "Mohammad Sadoghi and Kenneth A. Ross and Mustafa Canim and Bishwaranjan Bhattacharjee", title = "Making updates disk-{I/O} friendly using {SSDs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "997--1008", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multiversion databases store both current and historical data. Rows are typically annotated with timestamps representing the period when the row is/was valid. We develop novel techniques for reducing index maintenance in multiversion databases, so that indexes can be used effectively for analytical queries over current data without being a heavy burden on transaction throughput. To achieve this end, we re-design persistent index data structures in the storage hierarchy to employ an extra level of indirection.
The indirection level is stored on solid state disks that can support very fast random I/Os, so that traversing the extra level of indirection incurs a relatively small overhead. The extra level of indirection dramatically reduces the number of magnetic disk I/Os that are needed for index updates, and localizes maintenance to indexes on updated attributes. Further, we batch insertions within the indirection layer in order to reduce physical disk I/Os for indexing new records. By reducing the index maintenance overhead on transactions, we enable operational data stores to create more indexes to support queries. We have developed a prototype of our indirection proposal by extending the widely used Generalized Search Tree (GiST) open-source project, which is also employed in PostgreSQL. Our working implementation demonstrates that we can significantly reduce index maintenance and/or query processing cost, by a factor of 3. For insertions of new records, our novel batching technique can save up to 90\% of the insertion time.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aji:2013:HGH, author = "Ablimit Aji and Fusheng Wang and Hoang Vo and Rubao Lee and Qiaoling Liu and Xiaodong Zhang and Joel Saltz", title = "{Hadoop GIS}: a high performance spatial data warehousing system over {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1009--1020", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Support of high performance queries on large volumes of spatial data becomes increasingly important in many application domains, including geospatial problems in numerous fields, location based services, and emerging scientific applications that are increasingly data- and compute-intensive. The emergence of massive scale spatial data is due to the proliferation of cost effective and ubiquitous positioning technologies, development of high resolution imaging technologies, and contribution from a large number of community users. There are two major challenges for managing and querying massive spatial data to support spatial queries: the explosion of spatial data, and the high computational complexity of spatial queries. In this paper, we present Hadoop-GIS --- a scalable and high performance spatial data warehousing system for running large scale spatial queries on Hadoop. Hadoop-GIS supports multiple types of spatial queries on MapReduce through spatial partitioning, customizable spatial query engine RESQUE, implicit parallel spatial query execution on MapReduce, and effective methods for amending query results through handling boundary objects. Hadoop-GIS utilizes global partition indexing and customizable on demand local spatial indexing to achieve efficient query processing. 
Hadoop-GIS is integrated into Hive to support declarative spatial queries with an integrated architecture. Our experiments have demonstrated the high efficiency of Hadoop-GIS on query response and high scalability to run on commodity clusters. Our comparative experiments have showed that performance of Hadoop-GIS is on par with parallel SDBMS and outperforms SDBMS for compute-intensive queries. Hadoop-GIS is available as a set of library for processing spatial queries, and as an integrated software package in Hive.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bamba:2013:SCO, author = "Bhuvan Bamba and Siva Ravada and Ying Hu and Richard Anderson", title = "Statistics collection in {Oracle Spatial and Graph}: fast histogram construction for complex geometry objects", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1021--1032", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Oracle Spatial and Graph is a geographic information system (GIS) which provides users the ability to store spatial data alongside conventional data in Oracle. As a result of the coexistence of spatial and other data, we observe a trend towards users performing increasingly complex queries which involve spatial as well as non-spatial predicates. Accurate selectivity values, especially for queries with multiple predicates requiring joins among numerous tables, are essential for the database optimizer to determine a good execution plan. For queries involving spatial predicates, this requires that reasonably accurate statistics collection has been performed on the spatial data. 
For extensible data cartridges such as Oracle Spatial and Graph, the optimizer expects to receive accurate predicate selectivity and cost values from functions implemented within the data cartridge. Although statistics collection for spatial data has been researched in academia for a few years; to the best of our knowledge, this is the first work to present spatial statistics collection implementation details for a commercial GIS database. In this paper, we describe our experiences with implementation of statistics collection methods for complex geometry objects within Oracle Spatial and Graph. Firstly, we exemplify issues with previous partitioning-based algorithms in presence of complex geometry objects and suggest enhancements which resolve the issues. Secondly, we propose a main memory implementation which not only speeds up the disk-based partitioning algorithms but also utilizes existing R-tree indexes to provide surprisingly accurate selectivity estimates. Last but not the least, we provide extensive experimental results and an example study which displays the efficacy of our approach on Oracle query performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Akidau:2013:MFT, author = "Tyler Akidau and Alex Balikov and Kaya Bekiroglu and Slava Chernyak and Josh Haberman and Reuven Lax and Sam McVeety and Daniel Mills and Paul Nordstrom and Sam Whittle", title = "{MillWheel}: fault-tolerant stream processing at {Internet} scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1033--1044", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MillWheel is a framework for building low-latency data-processing applications that is widely used at Google. Users specify a directed computation graph and application code for individual nodes, and the system manages persistent state and the continuous flow of records, all within the envelope of the framework's fault-tolerance guarantees. This paper describes MillWheel's programming model as well as its implementation. The case study of a continuous anomaly detector in use at Google serves to motivate how many of MillWheel's features are used. MillWheel's programming model provides a notion of logical time, making it simple to write time-based aggregations. MillWheel was designed from the outset with fault tolerance and scalability in mind. In practice, we find that MillWheel's unique combination of scalability, fault tolerance, and a versatile programming model lends itself to a wide variety of problems at Google.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rae:2013:OAS, author = "Ian Rae and Eric Rollins and Jeff Shute and Sukhdeep Sodhi and Radek Vingralek", title = "Online, asynchronous schema change in {F1}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1045--1056", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce a protocol for schema evolution in a globally distributed database management system with shared data, stateless servers, and no global membership. Our protocol is asynchronous --- it allows different servers in the database system to transition to a new schema at different times --- and online --- all servers can access and update all data during a schema change. We provide a formal model for determining the correctness of schema changes under these conditions, and we demonstrate that many common schema changes can cause anomalies and database corruption. We avoid these problems by replacing corruption-causing schema changes with a sequence of schema changes that is guaranteed to avoid corrupting the database so long as all servers are no more than one schema version behind at any time. Finally, we discuss a practical implementation of our protocol in F1, the database management system that stores data for Google AdWords.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abraham:2013:SDD, author = "Lior Abraham and John Allen and Oleksandr Barykin and Vinayak Borkar and Bhuwan Chopra and Ciprian Gerea and Daniel Merl and Josh Metzler and David Reiss and Subbu Subramanian and Janet L.
Wiener and Okay Zed", title = "{Scuba}: diving into data at {Facebook}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1057--1067", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Facebook takes performance monitoring seriously. Performance issues can impact over one billion users so we track thousands of servers, hundreds of PB of daily network traffic, hundreds of daily code changes, and many other metrics. We require latencies of under a minute from events occurring (a client request on a phone, a bug report filed, a code change checked in) to graphs showing those events on developers' monitors. Scuba is the data management system Facebook uses for most real-time analysis. Scuba is a fast, scalable, distributed, in-memory database built at Facebook. It currently ingests millions of rows (events) per second and expires data at the same rate. Scuba stores data completely in memory on hundreds of servers each with 144 GB RAM. To process each query, Scuba aggregates data from all servers. Scuba processes almost a million queries per day. Scuba is used extensively for interactive, ad hoc, analysis queries that run in under a second over live data. In addition, Scuba is the workhorse behind Facebook's code regression analysis, bug report monitoring, ads revenue monitoring, and performance debugging.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shute:2013:FDS, author = "Jeff Shute and Radek Vingralek and Bart Samwel and Ben Handy and Chad Whipkey and Eric Rollins and Mircea Oancea and Kyle Littlefield and David Menestrina and Stephan Ellner and John Cieslewicz and Ian Rae and Traian Stancescu and Himani Apte", title = "{F1}: a distributed {SQL} database that scales", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1068--1079", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "F1 is a distributed relational database system built at Google to support the AdWords business. F1 is a hybrid database that combines high availability, the scalability of NoSQL systems like Bigtable, and the consistency and usability of traditional SQL databases. F1 is built on Spanner, which provides synchronous cross-datacenter replication and strong consistency. Synchronous replication implies higher commit latency, but we mitigate that latency by using a hierarchical schema model with structured data types and through smart application design. F1 also includes a fully functional distributed SQL query engine and automatic change tracking and publishing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Raman:2013:DBA, author = "Vijayshankar Raman and Gopi Attaluri and Ronald Barber and Naresh Chainani and David Kalmuk and Vincent KulandaiSamy and Jens Leenstra and Sam Lightstone and Shaorong Liu and Guy M. 
Lohman and Tim Malkemus and Rene Mueller and Ippokratis Pandis and Berni Schiefer and David Sharpe and Richard Sidle and Adam Storm and Liping Zhang", title = "{DB2} with {BLU} acceleration: so much more than just a column store", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1080--1091", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "DB2 with BLU Acceleration deeply integrates innovative new techniques for defining and processing column-organized tables that speed read-mostly Business Intelligence queries by 10 to 50 times and improve compression by 3 to 10 times, compared to traditional row-organized tables, without the complexity of defining indexes or materialized views on those tables. But DB2 BLU is much more than just a column store. Exploiting frequency-based dictionary compression and main-memory query processing technology from the Blink project at IBM Research --- Almaden, DB2 BLU performs most SQL operations --- predicate application (even range predicates and IN-lists), joins, and grouping --- on the compressed values, which can be packed bit-aligned so densely that multiple values fit in a register and can be processed simultaneously via SIMD (single-instruction, multiple-data) instructions. Designed and built from the ground up to exploit modern multi-core processors, DB2 BLU's hardware-conscious algorithms are carefully engineered to maximize parallelism by using novel data structures that need little latching, and to minimize data-cache and instruction-cache misses. Though DB2 BLU is optimized for in-memory processing, database size is not limited by the size of main memory. Fine-grained synopses, late materialization, and a new probabilistic buffer pool protocol for scans minimize disk I/Os, while aggressive prefetching reduces I/O stalls.
Full integration with DB2 ensures that DB2 with BLU Acceleration benefits from the full functionality and robust utilities of a mature product, while still enjoying order-of-magnitude performance gains from revolutionary technology without even having to change the SQL, and can mix column-organized and row-organized tables in the same tablespace and even within the same query.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ovsiannikov:2013:QFS, author = "Michael Ovsiannikov and Silvius Rus and Damian Reeves and Paul Sutter and Sriram Rao and Jim Kelly", title = "The {Quantcast File System}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1092--1101", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Quantcast File System (QFS) is an efficient alternative to the Hadoop Distributed File System (HDFS). QFS is written in C++, is plugin compatible with Hadoop MapReduce, and offers several efficiency improvements relative to HDFS: 50\% disk space savings through erasure coding instead of replication, a resulting doubling of write throughput, a faster name node, support for faster sorting and logging through a concurrent append feature, a native command line client much faster than hadoop fs, and global feedback-directed I/O device management. As QFS works out of the box with Hadoop, migrating data from HDFS to QFS involves simply executing hadoop distcp. QFS is being developed fully open source and is available under an Apache license from https://github.com/quantcast/qfs. Multi-petabyte QFS instances have been in heavy production use since 2011.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bellamkonda:2013:ABD, author = "Srikanth Bellamkonda and Hua-Gang Li and Unmesh Jagtap and Yali Zhu and Vince Liang and Thierry Cruanes", title = "Adaptive and big data scale parallel execution in {Oracle}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1102--1113", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper showcases some of the newly introduced parallel execution methods in Oracle RDBMS. These methods provide highly scalable and adaptive evaluation for the most commonly used SQL operations --- joins, group-by, rollup/cube, grouping sets, and window functions. The novelty of these techniques is their use of multi-stage parallelization models, accommodation of optimizer mistakes, and the runtime parallelization and data distribution decisions. These parallel plans adapt based on the statistics gathered on the real data at query execution time. We realized enormous performance gains from these adaptive parallelization techniques. The paper also discusses our approach to parallelize queries with operations that are inherently serial. We believe all these techniques will make their way into big data analytics and other massively parallel database systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bellare:2013:WSM, author = "Kedar Bellare and Carlo Curino and Ashwin Machanavajjhala and Peter Mika and Mandar Rahurkar and Aamod Sane", title = "{WOO}: a scalable and multi-tenant platform for continuous knowledge base synthesis", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1114--1125", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Search, exploration and social experience on the Web has recently undergone tremendous changes with search engines, web portals and social networks offering a different perspective on information discovery and consumption. This new perspective is aimed at capturing user intents, and providing richer and highly connected experiences. The new battleground revolves around technologies for the ingestion, disambiguation and enrichment of entities from a variety of structured and unstructured data sources --- we refer to this process as knowledge base synthesis. This paper presents the design, implementation and production deployment of the Web Of Objects (WOO) system, a Hadoop-based platform tackling such challenges. WOO has been designed and implemented to enable various products in Yahoo! to synthesize knowledge bases (KBs) of entities relevant to their domains. Currently, the implementation of WOO we describe is used by various Yahoo! properties such as Intonow, Yahoo! Local, Yahoo! Events and Yahoo! Search.
This paper highlights: (i) challenges that arise in designing, building and operating a platform that handles multi-domain, multi-version, and multi-tenant disambiguation of web-scale knowledge bases (hundreds of millions of entities), (ii) the architecture and technical solutions we devised, and (iii) an evaluation on real-world production datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gattani:2013:EEL, author = "Abhishek Gattani and Digvijay S. Lamba and Nikesh Garera and Mitul Tiwari and Xiaoyong Chai and Sanjib Das and Sri Subramaniam and Anand Rajaraman and Venky Harinarayan and AnHai Doan", title = "Entity extraction, linking, classification, and tagging for social media: a {Wikipedia}-based approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1126--1137", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many applications that process social data, such as tweets, must extract entities from tweets (e.g., ``Obama'' and ``Hawaii'' in ``Obama went to Hawaii''), link them to entities in a knowledge base (e.g., Wikipedia), classify tweets into a set of predefined topics, and assign descriptive tags to tweets. Few solutions exist today to solve these problems for social data, and they are limited in important ways. Further, even though several industrial systems such as OpenCalais have been deployed to solve these problems for text data, little if any has been published about them, and it is unclear if any of the systems has been tailored for social media. In this paper we describe in depth an end-to-end industrial system that solves these problems for social data. 
The system has been developed and used heavily in the past three years, first at Kosmix, a startup, and later at WalmartLabs. We show how our system uses a Wikipedia-based global ``real-time'' knowledge base that is well suited for social data, how we interleave the tasks in a synergistic fashion, how we generate and use contexts and social signals to improve task accuracy, and how we scale the system to the entire Twitter firehose. We describe experiments that show that our system outperforms current approaches. Finally we describe applications of the system at Kosmix and WalmartLabs, and lessons learned.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elmeleegy:2013:OTD, author = "Hazem Elmeleegy and Yinan Li and Yan Qi and Peter Wilmot and Mingxi Wu and Santanu Kolay and Ali Dasdan and Songting Chen", title = "Overview of turn data management platform for digital advertising", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1138--1149", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper gives an overview of Turn Data Management Platform (DMP). We explain the purpose of this type of platforms, and show how it is positioned in the current digital advertising ecosystem. We also provide a detailed description of the key components in Turn DMP. These components cover the functions of (1) data ingestion and integration, (2) data warehousing and analytics, and (3) real-time data activation. For all components, we discuss the main technical and research challenges, as well as the alternative design choices. 
One of the main goals of this paper is to highlight the central role that data management is playing in shaping this fast growing multi-billion dollars industry.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Curtiss:2013:USS, author = "Michael Curtiss and Iain Becker and Tudor Bosman and Sergey Doroshenko and Lucian Grijincu and Tom Jackson and Sandhya Kunnatur and Soren Lassen and Philip Pronin and Sriram Sankar and Guanghao Shen and Gintaras Woss and Chao Yang and Ning Zhang", title = "{Unicorn}: a system for searching the social graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1150--1161", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Unicorn is an online, in-memory social graph-aware indexing system designed to search trillions of edges between tens of billions of users and entities on thousands of commodity servers. Unicorn is based on standard concepts in information retrieval, but it includes features to promote results with good social proximity. It also supports queries that require multiple round-trips to leaves in order to retrieve objects that are more than one edge away from source nodes. Unicorn is designed to answer billions of queries per day at latencies in the hundreds of milliseconds, and it serves as an infrastructural building block for Facebook's Graph Search product. In this paper, we describe the data model and query language supported by Unicorn. We also describe its evolution as it became the primary backend for Facebook's search offerings.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ramazzina:2013:NSC, author = "Sergio Ramazzina and Chiara L. Ballari and Daniela Somenzi", title = "A new service for customer care based on the {Trentorise} bigdata platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1162--1163", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we give an overview of a platform implemented in collaboration with the University of Trento to deliver an innovative family of customer care services.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antonelli:2013:EDM, author = "Fabrizio Antonelli and Antonino Casella and Cristiana Chitic and Roberto Larcher and Giovanni Torrisi", title = "Exploiting the diversity, mass and speed of territorial data by {TELCO Operator} for better user services", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1164--1165", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bedini:2013:TBD, author = "Ivan Bedini and Benedikt Elser and Yannis Velegrakis", title = "The {Trento} big data platform for public administration and large companies: use cases and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1166--1167", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tran:2013:DQO, author = "Nga Tran and Sreenath Bodagala and Jaimin Dave", title = "Designing query optimizers for big data problems of the future", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1168--1169", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Vertica SQL Query Optimizer was written from the ground up for the Vertica Analytic Database. Its design, and the tradeoffs we encountered during implementation, support the case that the full power of novel database systems can be realized only with a custom Query Optimizer, carefully crafted exclusively for the system in which it operates.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Franceschini:2013:HMV, author = "Monica Franceschini", title = "How to maximize the value of big data with the open source {SpagoBI} suite through a comprehensive approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1170--1171", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper describes the approach adopted by SpagoBI suite (\path=www.spagobi.org=) to manage large volumes of heterogeneous structured and unstructured data, to perform real-time Business Intelligence on Big Data streaming and to give meaning to data through the semantic analysis. SpagoBI supplies meaningful data insights through the main concept of persistable and schedulable datasets, and using tools such as self-service BI, ad-hoc reporting, interactive dashboards and explorative analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chang:2013:CAC, author = "Edward Y. Chang", title = "Context-aware computing: opportunities and open issues", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1172--1173", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A 2011 Gartner report [3] describes context-aware computing as a game-changing opportunity for enterprises to improve both productivity and profits. 
Context-aware computing is about making applications and content more relevant to a user's context, e.g., when and where the user is, thereby improving user experience. For instance, a coupon delivered to a user at a wrong time or at a wrong location is considered a nuisance. On the contrary, receiving a timely, usable coupon before purchasing a merchandise is a treat. Context-aware computing is not a new concept, but the ongoing mobile revolution makes it both necessary and feasible. $ \bullet $ Necessary because the mobile phone display is small and information must be delivered with much higher relevance and precision to meet user needs. $ \bullet $ Feasible because small, light-weight mobile devices allow users to almost always carry them around, and much can be learned via a phone about its user's habits and states. Context-aware computing involves first acquiring context and then taking context-dependent actions. For instance, a phone can sense a user's location and turn off its GPS unit to conserve power when the user enters a building, or it can collect EKG signals of a user and trigger an alert if the user's heart beats irregularly. Similarly, a restaurant can send a coupon to a user when that user is queued up in front of a nearby restaurant. The useful context can be divided into three categories: information on the user (knowledge of habits, emotional state, biophysiological conditions), the user's environment (time, location, co-location of others, social interaction), and the user's tasks (transportation mode, engaged tasks, general goals) [4]. Context-aware computing can be applied to benefit applications in many areas including but not limited to information retrieval, facility management, productivity enhancement, in addition to the aforementioned three examples representing power management, health care, and commerce, respectively.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hassanzadeh:2013:NGD, author = "Oktie Hassanzadeh and Anastasios Kementsietsidis and Benny Kimelfeld and Rajasekar Krishnamurthy and Fatma {\"O}zcan and Ippokratis Pandis", title = "Next generation data analytics at {IBM} research", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1174--1175", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Brunato:2013:LIO, author = "Mauro Brunato and Roberto Battiti", title = "Learning and intelligent optimization {(LION)}: one ring to rule them all", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1176--1177", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Almost by definition, optimization is a source of a tremendous power for automatically improving processes, decisions, products and services. But its potential is still largely unexploited in most real-world contexts. One of the main reasons blocking its widespread adoption is that standard optimization assumes the existence of a function $ f(x) $ to be minimized, while in most real-world business contexts this function does not exist or is extremely difficult and costly to build by hand. Machine learning (ML) comes to the rescue: the function (the model) can be built by machine learning starting from abundant data. 
By Learning and Intelligent Optimization (LION) we mean this combination of learning from data and optimization which can be applied to complex, dynamic, stochastic contexts. This combination dramatically increases the automation level and puts more power directly in the hands of decision makers without resorting to intermediate layers of data scientists (LION has a huge potential for a self-service usage). Reaching this goal is a huge challenge and it will require research at the boundary between two areas, machine learning and optimization, which have been traditionally separated.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lomet:2013:MSS, author = "David Lomet", title = "{Microsoft SQL Server}'s integrated database approach for modern applications and hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1178--1179", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, there has been much renewed interest in re-architecting database systems to exploit new hardware. While some efforts have suggested that one needs specialized engines (``one size does not fit all''), the approach pursued by Microsoft's SQL Server has been to integrate multiple elements into a common architecture. This brings customers what they want by reducing data impedance mismatches between database systems that they are using for multiple purposes. This integration is, of course, more easily said than done. But this is, in fact, precisely what the SQL Server team has done.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hacigumus:2013:OMS, author = "Hakan Hac{\i}g{\"u}m{\"u}{\c{s}} and Jagan Sankaranarayanan and Junichi Tatemura and Jeff LeFevre and Neoklis Polyzotis", title = "{Odyssey}: a multistore system for evolutionary analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1180--1181", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bouquet:2013:GEN, author = "Paolo Bouquet and Andrea Molinari", title = "A global {Entity Name System (ENS)} for data ecosystems", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1182--1183", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "After decades of schema-centric research on data management and integration, the evolution of data on the web and the adoption of resource-based models seem to have shifted the focus towards an entity-centric approach. Our thesis is that the missing element to achieve the full potential of this approach is the development of what we call an Entity Name System (ENS), namely a system which provides a collection of general services for managing the lifecycle of globally unique identifiers in an open and decentralized environment. 
The claim is that this system can indeed play the coordination role that the DNS played for the document-centric development of the current web.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sikka:2013:SHE, author = "Vishal Sikka and Franz F{\"a}rber and Anil Goel and Wolfgang Lehner", title = "{SAP HANA}: the evolution from a modern main-memory data platform to an enterprise application platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1184--1185", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SAP HANA is a pioneering, and one of the best performing, data platform designed from the ground up to heavily exploit modern hardware capabilities, including SIMD, and large memory and CPU footprints. As a comprehensive data management solution, SAP HANA supports the complete data life cycle encompassing modeling, provisioning, and consumption. This extended abstract outlines the vision and planned next step of the SAP HANA evolution growing from a core data platform into an innovative enterprise application platform as the foundation for current as well as novel business applications in both on-premise and on-demand scenarios. We argue that only a holistic system design rigorously applying co-design at different levels may yield a highly optimized and sustainable platform for modern enterprise applications.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nambiar:2013:KTR, author = "Raghunath Nambiar and Meikel Poess", title = "Keeping the {TPC} relevant!", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1186--1187", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Transaction Processing Performance Council (TPC) is a nonprofit organization founded in 1988 to define transaction processing and database benchmarks. Since then, the TPC has played a crucial role in providing the industry with relevant standards for total system performance, price-performance, and energy-efficiency comparisons. TPC benchmarks are widely used by database researchers and academia. Historically known for database-centric standards, the TPC has developed a benchmark for virtualization and is currently developing a multisource data integration benchmark. The technology landscape is changing at a rapid pace, challenging industry experts and researchers to develop innovative techniques for evaluating, measuring, and characterizing the performance of modern application systems. The Technology Conference series on Performance Evaluation and Benchmarking (TPCTC), introduced in 2009, and the new TPC-Express initiatives are steps taken by the TPC to be relevant in the coming years and beyond.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2013:BDI, author = "Xin Luna Dong and Divesh Srivastava", title = "Big data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1188--1189", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Big Data era is upon us: data is being generated, collected and analyzed at an unprecedented scale, and data-driven decision making is sweeping through society. Since the value of data explodes when it can be linked and fused with other data, addressing the big data integration (BDI) challenge is critical to realizing the promise of Big Data. BDI differs from traditional data integration in many dimensions: (i) the number of data sources, even for a single domain, has grown to be in the tens of thousands, (ii) many of the data sources are very dynamic, as a huge amount of newly collected data are continuously made available, (iii) the data sources are extremely heterogeneous in their structure, with considerable variety even for substantially similar entities, and (iv) the data sources are of widely differing qualities, with significant differences in the coverage, accuracy and timeliness of data provided. This tutorial explores the progress that has been made by the data integration community on the topics of schema mapping, record linkage and data fusion in addressing these novel challenges faced by big data integration, and identifies a range of open problems for the community.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Viglas:2013:JTC, author = "Stratis D. 
Viglas", title = "Just-in-time compilation for {SQL} query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1190--1191", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Just-in-time compilation of SQL queries into native code has recently emerged as a viable alternative to interpretation-based query processing. We present the salient results of research in this fresh area, addressing all aspects of the query processing stack. Throughout the discussion we draw analogies to the general code generation techniques used in contemporary compiler technology. At the same time we describe the open research problems of the area.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ailamaki:2013:TST, author = "Anastasia Ailamaki and Ryan Johnson and Ippokratis Pandis and P{\i}nar T{\"o}z{\"u}n", title = "Toward scalable transaction processing: evolution of {Shore-MT}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1192--1193", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Designing scalable transaction processing systems on modern multicore hardware has been a challenge for almost a decade. The typical characteristics of transaction processing workloads lead to a high degree of unbounded communication on multicores for conventional system designs. In this tutorial, we initially present a systematic way of eliminating scalability bottlenecks of a transaction processing system, which is based on minimizing the unbounded communication. 
Then, we show several techniques that apply the presented methodology to minimize logging, locking, latching etc. related bottlenecks of transaction processing systems. In parallel, we demonstrate the internals of the Shore-MT storage manager and how they have evolved over the years in terms of scalability on multicore hardware through such techniques. We also teach how to use Shore-MT with the various design options it offers through its application layer Shore-Kits and Metadata Frontend.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elmore:2013:TDV, author = "Aaron J. Elmore and Carlo Curino and Divyakant Agrawal and Amr {El Abbadi}", title = "Towards database virtualization for database as a service", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1194--1195", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Advances in operating system and storage-level virtualization technologies have enabled the effective consolidation of heterogeneous applications in a shared cloud infrastructure. Novel research challenges arising from this new shared environment include load balancing, workload estimation, resource isolation, machine replication, live migration, and an emergent need of automation to handle large scale operations with minimal manual intervention. Given that databases are at the core of most applications that are deployed in the cloud, database management systems (DBMSs) represent a very important technology component that needs to be virtualized in order to realize the benefits of virtualization from autonomic management of data-intensive applications in large scale data-centers. 
The goal of this tutorial is to survey the techniques used in providing elasticity in virtual machine systems, shared storage systems, and survey database research on multitenant architectures and elasticity primitives. This foundation of core Database as a Service advances, together with a primer of important related topics in OS and storage-level virtualization, are central for anyone that wants to operate in this area of research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mokbel:2013:MSN, author = "Mohamed F. Mokbel and Mohamed Sarwat", title = "Mobility and social networking: a data management perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "11", pages = "1196--1197", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:56:54 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This tutorial presents the state-of-the-art research that lies at the intersection of two hot topics in the data management community: (1) social networking and (2) mobility. In this tutorial, we give an overview of existing research work, systems, and applications related to both social networking and mobility. In addition, we introduce several resources (i.e., datasets, software tools) as well as a list of promising research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xue:2013:DSD, author = "Andy Yuan Xue and Rui Zhang and Yu Zheng and Xing Xie and Jianhui Yu and Yong Tang", title = "{DesTeller}: a system for destination prediction based on trajectories with privacy protection", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1198--1201", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Destination prediction is an essential task for a number of emerging location based applications such as recommending sightseeing places and sending targeted advertisements. A common approach to destination prediction is to derive the probability of a location being the destination based on historical trajectories. However, existing techniques suffer from the ``data sparsity problem'', i.e., the number of available historical trajectories is far from sufficient to cover all possible trajectories. This problem considerably limits the amount of query trajectories whose predicted destinations can be inferred. In this demonstration, we showcase a system named ``DesTeller'' that is interactive, user-friendly, publicly accessible, and capable of answering real-time queries. The underlying algorithm Sub-Trajectory Synthesis (SubSyn) successfully addressed the data sparsity problem and is able to predict destinations for almost every query submitted by travellers. We also consider the privacy protection issue in case an adversary uses SubSyn algorithm to derive sensitive location information of users.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2013:SPS, author = "Zhe Chen and Michael Cafarella and Jun Chen and Daniel Prevo and Junfeng Zhuang", title = "{Senbazuru}: a prototype spreadsheet database management system", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1202--1205", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spreadsheets have become a critical data management tool, but they lack explicit relational metadata, making it difficult to join or integrate data across multiple spreadsheets. Because spreadsheet data are widely available on a huge range of topics, a tool that allows easy spreadsheet integration would be hugely beneficial for a variety of users. We demonstrate that Senbazuru, a prototype spreadsheet database management system (SSDBMS), is able to extract relational information from spreadsheets. By doing so, it opens up opportunities for integration among spreadsheets and with other relational sources. Senbazuru allows users to search for relevant spreadsheets in a large corpus, probabilistically constructs a relational version of the data, and offers several relational operations over the resulting extracted data (including joins to other spreadsheet data). Our demonstration is available on two clients: a JavaScript-rich Web site and a touch interface on the iPad. During the demo, Senbazuru will allow VLDB participants to search spreadsheets, extract relational data from them, and apply relational operators such as select and join.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Smits:2013:RFQ, author = "Gr{\'e}gory Smits and Olivier Pivert and Thomas Girault", title = "{ReqFlex}: fuzzy queries for everyone", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1206--1209", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration we present a complete fuzzy-set-based approach to preference queries that tackles the two main questions raised by the introduction of flexibility and personalization when querying relational databases: (i) how to efficiently execute preference queries? and, (ii) how to help users define preferences and queries? As an answer to the first question, we propose PostgreSQL\_f, a module implemented on top of PostgreSQL to handle fuzzy queries. To answer the second question, we propose ReqFlex an intuitive user interface to the definition of preferences and the construction of fuzzy queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaufmann:2013:CIT, author = "Martin Kaufmann and Panagiotis Vagenas and Peter M. 
Fischer and Donald Kossmann and Franz F{\"a}rber", title = "Comprehensive and interactive temporal query processing with {SAP HANA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1210--1213", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demo, we present a prototype of a main memory database system which provides a wide range of temporal operators featuring predictable and interactive response times. Much of real-life data is temporal in nature, and there is an increasing application demand for temporal models and operations in databases. Nevertheless, SQL:2011 has only recently overcome a decade-long standstill on standardizing temporal features. As a result, few database systems provide any temporal support, and even those only have limited expressiveness and poor performance. Our prototype combines an in-memory column store and a novel, generic temporal index structure named Timeline Index. As we will show on a workload based on real customer use cases, it achieves predictable and interactive query performance for a wide range of temporal query types and data sizes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Grust:2013:FDT, author = "Torsten Grust and Nils Schweinsberg and Alexander Ulrich", title = "Functions are data too: defunctionalization for {PL\slash SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1214--1217", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a full-fledged implementation of first-class functions for the widely used PL/SQL database programming language. Functions are treated as regular data items that may be (1) constructed at query runtime, (2) stored in and retrieved from tables, (3) assigned to variables, and (4) passed to and from other (higher-order) functions. The resulting PL/SQL dialect concisely and elegantly expresses a wide range of new query idioms which would be cumbersome to formulate if functions remained second-class citizens. We include a diverse set of application scenarios that make these advantages tangible. First-class PL/SQL functions require featherweight syntactic extensions only and come with a non-invasive implementation---the defunctionalization transformation---that can entirely be built on top of existing relational DBMS infrastructure. An interactive demonstrator helps users to experiment with the ``function as data'' paradigm and to earn a solid intuition of its inner workings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ebaid:2013:NGD, author = "Amr Ebaid and Ahmed Elmagarmid and Ihab F. 
Ilyas and Mourad Ouzzani and Jorge-Arnulfo Quian{\'e}-Ruiz and Nan Tang and Si Yin", title = "{NADEEF}: a generalized data cleaning system", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1218--1221", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present NADEEF, an extensible, generic and easy-to-deploy data cleaning system. NADEEF distinguishes between a programming interface and a core to achieve generality and extensibility. The programming interface allows users to specify data quality rules by writing code that implements predefined classes. These classes uniformly define what is wrong with the data and (possibly) how to fix it. We will demonstrate the following features provided by NADEEF. (1) Heterogeneity: The programming interface can be used to express many types of data quality rules beyond the well known CFDs (FDs), MDs and ETL rules. (2) Interdependency: The core algorithms can interleave multiple types of rules to detect and repair data errors. (3) Deployment and extensibility: Users can easily customize NADEEF by defining new types of rules, or by extending the core. (4) Metadata management and data custodians: We show a live data quality dashboard to effectively involve users in the data cleaning process.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bergamaschi:2013:QKS, author = "Sonia Bergamaschi and Francesco Guerra and Matteo Interlandi and Raquel Trillo-Lado and Yannis Velegrakis", title = "{QUEST}: a keyword search system for relational data based on semantic and machine learning techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1222--1225", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We showcase QUEST (QUEry generator for STructured sources), a search engine for relational databases that combines semantic and machine learning techniques for transforming keyword queries into meaningful SQL queries. The search engine relies on two approaches: the forward, providing mappings of keywords into database terms (names of tables and attributes, and domains of attributes), and the backward, computing the paths joining the data structures identified in the forward step. The results provided by the two approaches are combined within a probabilistic framework based on the Dempster-Shafer Theory. We demonstrate QUEST capabilities, and we show how, thanks to the flexibility obtained by the probabilistic combination of different techniques, QUEST is able to compute high quality results even with few training data and/or with hidden data sources such as those found in the Deep Web.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bogh:2013:GNA, author = "Kenneth S. B{\o}gh and Anders Skovsgaard and Christian S. 
Jensen", title = "{GroupFinder}: a new approach to top-$k$ point-of-interest group retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1226--1229", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The notion of point-of-interest (PoI) has existed since paper road maps began to include markings of useful places such as gas stations, hotels, and tourist attractions. With the introduction of geopositioned mobile devices such as smartphones and mapping services such as Google Maps, the retrieval of PoIs relevant to a user's intent has become a problem of automated spatio-textual information retrieval. Over the last several years, substantial research has gone into the invention of functionality and efficient implementations for retrieving nearby PoIs. However, with a couple of exceptions existing proposals retrieve results at single-PoI granularity. We assume that a mobile device user issues queries consisting of keywords and an automatically supplied geo-position, and we target the common case where the user wishes to find nearby groups of PoIs that are relevant to the keywords. Such groups are relevant to users who wish to conveniently explore several options before making a decision such as to purchase a specific product. Specifically, we demonstrate a practical proposal for finding top-$k$ PoI groups in response to a query. We show how problem parameter settings can be mapped to options that are meaningful to users. Further, although this kind of functionality is prone to combinatorial explosion, we will demonstrate that the functionality can be supported efficiently in practical settings.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Eldawy:2013:DSE,
  author =       "Ahmed Eldawy and Mohamed F. Mokbel",
  title =        "A demonstration of {SpatialHadoop}: an efficient {MapReduce} framework for spatial data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1230--1233",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This demo presents SpatialHadoop as the first full-fledged MapReduce framework with native support for spatial data. SpatialHadoop is a comprehensive extension to Hadoop that pushes spatial data inside the core functionality of Hadoop. SpatialHadoop runs existing Hadoop programs as is, yet, it achieves order(s) of magnitude better performance than Hadoop when dealing with spatial data. SpatialHadoop employs a simple spatial high level language, a two-level spatial index structure, basic spatial components built inside the MapReduce layer, and three basic spatial operations: range queries, $k$-NN queries, and spatial join. Other spatial operations can be similarly deployed in SpatialHadoop. We demonstrate a real system prototype of SpatialHadoop running on an Amazon EC2 cluster against two sets of real spatial data obtained from Tiger Files and OpenStreetMap with sizes 60GB and 300GB, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Abbasoglu:2013:APC,
  author =       "Mehmet Ali Abbasoglu and Bugra Gedik and Hakan Ferhatosmanoglu",
  title =        "Aggregate profile clustering for telco analytics",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1234--1237",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Many telco analytics require maintaining call profiles based on recent customer call patterns. Such call profiles are typically organized as aggregations computed at different time scales over the recent customer interactions. Customer call profiles are key inputs for analytics targeted at improving operations, marketing, and sales of telco providers. Many of these analytics require clustering customer call profiles, so that customers with similar calling patterns can be modeled as a group. Example applications include optimizing tariffs, customer segmentation, and usage forecasting. In this demo, we present our system for scalable aggregate profile clustering in a streaming setting. We focus on managing anonymized segments of customers for tariff optimization. Due to the large number of customers, maintaining profile clusters have high processing and memory resource requirements. In order to tackle this problem, we apply distributed stream processing. However, in the presence of distributed state, it is a major challenge to partition the profiles over machines (nodes) such that memory and computation balance is maintained, while keeping the clustering accuracy high.
Furthermore, to adapt to potentially changing customer calling patterns, the partitioning of profiles to machines should be continuously revised, yet one should minimize the migration of profiles so as not to disturb the online processing of updates. We provide a re-partitioning technique that achieves all these goals. We keep micro-cluster summaries at each node, collect these summaries at a centralize node, and use a greedy algorithm with novel affinity heuristics to revise the partitioning. We present a demo that showcases our Storm and Hbase based implementation of the proposed solution in the context of a customer segmentation application.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2013:RRO,
  author =       "Luying Chen and Stefano Ortona and Giorgio Orsi and Michael Benedikt",
  title =        "{ROSeAnn}: reconciling opinions of semantic annotators",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1238--1241",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Named entity extractors can be used to enrich both text and Web documents with semantic annotations. While originally focused on a few standard entity types, the ecosystem of annotators is becoming increasingly diverse, with recognition capabilities ranging from generic to specialised entity types. Both the overlap and the diversity in annotator vocabularies motivate the need for managing and integrating semantic annotations: allowing users to see the results of multiple annotations and to merge them into a unified solution. We demonstrate ROSEANN, a system for the management of semantic annotations.
ROSEANN provides users with a unified view over the opinion of multiple independent annotators both on text and Web documents. It allows users to understand and reconcile conflicts between annotations via ontology-aware aggregation. ROSEANN incorporates both supervised aggregation, appropriate when representative training data is available, and an unsupervised method based on the notion of weighted-repair.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sarwat:2013:RAR,
  author =       "Mohamed Sarwat and James Avery and Mohamed F. Mokbel",
  title =        "{RecDB} in action: recommendation made easy in relational databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1242--1245",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In this paper, we demonstrate RecDB; a full-fledged database system that provides personalized recommendation to users. We implemented RecDB using an existing open source database system PostgreSQL, and we demonstrate the effectiveness of RecDB using two existing recommendation applications (1) Restaurant Recommendation, (2) Movie Recommendation. To make the demo even more interactive, we showcase a novel application that recommends research papers presented at VLDB 2013 to the conference attendees based on their publication history in DBLP.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Drosou:2013:PTE,
  author =       "Marina Drosou and Evaggelia Pitoura",
  title =        "{POIKILO}: a tool for evaluating the results of diversification models and algorithms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1246--1249",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Search result diversification has attracted considerable attention as a means of improving the quality of results retrieved by user queries. In this demonstration, we present Poikilo, a tool to assist users in locating and evaluating diverse results. We provide implementations of a wide suite of models and algorithms to compute and compare diverse results. Users can tune various diversification parameters, combine diversity with relevance and also see how diverse results change over time in the case of streaming data.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Amsterdamer:2013:CMA,
  author =       "Yael Amsterdamer and Yael Grossman and Tova Milo and Pierre Senellart",
  title =        "{CrowdMiner}: mining association rules from the crowd",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1250--1253",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This demo presents CrowdMiner, a system enabling the mining of interesting data patterns from the crowd.
While traditional data mining techniques have been used extensively for finding patterns in classic databases, they are not always suitable for the crowd, mainly because humans tend to remember only simple trends and summaries rather than exact details. To address this, CrowdMiner employs a novel crowd-mining algorithm, designed specifically for this context. The algorithm iteratively chooses appropriate questions to ask the crowd, while aiming to maximize the knowledge gain at each step. We demonstrate CrowdMiner through a Well-Being portal, constructed interactively by mining the crowd, and in particular the conference participants, for common health related practices and trends.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chen:2013:TTR,
  author =       "Chen Chen and Hongzhi Yin and Junjie Yao and Bin Cui",
  title =        "{TeRec}: a temporal recommender system over tweet stream",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1254--1257",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "As social media further integrates into our daily lives, people are increasingly immersed in real-time social streams via services such as Twitter and Weibo. One important observation in these online social platforms is that users' interests and the popularity of topics shift very fast, which poses great challenges on existing recommender systems to provide the right topics at the right time. In this paper, we extend the online ranking technique and propose a temporal recommender system --- TeRec.
In TeRec, when posting tweets, users can get recommendations of topics (hashtags) according to their real-time interests, they can also generate fast feedbacks according to the recommendations. TeRec provides the browser-based client interface which enables the users to access the real time topic recommendations, and the server side processes and stores the real-time stream data. The experimental study demonstrates the superiority of TeRec in terms of temporal recommendation accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Shkapsky:2013:GQN,
  author =       "Alexander Shkapsky and Kai Zeng and Carlo Zaniolo",
  title =        "Graph queries in a next-generation {Datalog} system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1258--1261",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Recent theoretical advances have enabled the use of special monotonic aggregates in recursion. These special aggregates make possible the concise expression and efficient implementation of a rich new set of advanced applications. Among these applications, graph queries are particularly important because of their pervasiveness in data intensive application areas. In this demonstration, we present our Deductive Application Language (DeAL) System, the first of a new generation of Deductive Database Systems that support applications that could not be expressed using regular stratification, or could be expressed using XY-stratification (also supported in DeAL) but suffer from inefficient execution.
Using example queries, we will (i) show how complex graph queries can be concisely expressed using DeAL and (ii) illustrate the formal semantics and efficient implementation of these powerful new monotonic constructs.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hendawi:2013:IFS,
  author =       "Abdeltawab M. Hendawi and Jie Bao and Mohamed F. Mokbel",
  title =        "{iRoad}: a framework for scalable predictive query processing on road networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1262--1265",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This demo presents the iRoad framework for evaluating predictive queries on moving objects for road networks. The main promise of the iRoad system is to support a variety of common predictive queries including predictive point query, predictive range query, predictive KNN query, and predictive aggregate query. The iRoad framework is equipped with a novel data structure, named reachability tree, employed to determine the reachable nodes for a moving object within a specified future time $T$. In fact, the reachability tree prunes the space around each object in order to significantly reduce the computation time. So, iRoad is able to scale up to handle real road networks with millions of nodes, and it can process heavy workloads on large numbers of moving objects.
During the demo, audience will be able to interact with iRoad through a well designed Graphical User Interface to issue different types of predictive queries on a real road network, to obtain the predictive heatmap of the area of interest, to follow the creation and the dynamic update of the reachability tree around a specific moving object, and finally to examine the system efficiency and scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Nagendra:2013:SFS,
  author =       "Mithila Nagendra and K. Sel{\c{c}}uk Candan",
  title =        "{SkySuite}: a framework of skyline-join operators for static and stream environments",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1266--1269",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Efficient processing of skyline queries has been an area of growing interest over both static and stream environments. Most existing static and streaming techniques assume that the skyline query is applied to a single data source. Unfortunately, this is not true in many applications in which, due to the complexity of the schema, the skyline query may involve attributes belonging to multiple data sources. Recently, in the context of static environments, various hybrid skyline-join algorithms have been proposed. However, these algorithms suffer from several drawbacks: they often need to scan the data sources exhaustively in order to obtain the set of skyline-join results; moreover, the pruning techniques employed to eliminate the tuples are largely based on expensive pairwise tuple-to-tuple comparisons.
On the other hand, most existing streaming methods focus on single stream skyline analysis, thus rendering these techniques unsuitable for applications that require a real-time ``join'' operation to be carried out before the skyline query can be answered. Based on these observations, we introduce and propose to demonstrate SkySuite: a framework of skyline-join operators that can be leveraged to efficiently process skyline-join queries over both static and stream environments. Among others, SkySuite includes (1) a novel Skyline-Sensitive Join (SSJ) operator that effectively processes skyline-join queries in static environments, and (2) a Layered Skyline-window-Join (LSJ) operator that incrementally maintains skyline-join results over stream environments.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhong:2013:PGP,
  author =       "Jianlong Zhong and Bingsheng He",
  title =        "Parallel graph processing on graphics processors made easy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1270--1273",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper demonstrates Medusa, a programming framework for parallel graph processing on graphics processors (GPUs). Medusa enables developers to leverage the massive parallelism and other hardware features of GPUs by writing sequential C/C++ code for a small set of APIs. This simplifies the implementation of parallel graph processing on the GPU. The runtime system of Medusa automatically executes the user-defined APIs in parallel on the GPU, with a series of graph-centric optimizations based on the architecture features of GPUs.
We will demonstrate the steps of developing GPU-based graph processing algorithms with Medusa, and the superior performance of Medusa with both real-world and synthetic datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Richter:2013:MAO,
  author =       "Stefan Richter and Jens Dittrich and Stefan Schuh and Tobias Frey",
  title =        "{Mosquito}: another one bites the data upload stream",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1274--1277",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Mosquito is a lightweight and adaptive physical design framework for Hadoop. Mosquito connects to existing data pipelines in Hadoop MapReduce and/or HDFS, observes the data, and creates better physical designs, i.e. indexes, as a byproduct. Our approach is minimally invasive, yet it allows users and developers to easily improve the runtime of Hadoop. We present three important use cases: first, how to create indexes as a byproduct of data uploads into HDFS; second, how to create indexes as a byproduct of map tasks; and third, how to execute map tasks as a byproduct of HDFS data uploads. These use cases may even be combined.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hardock:2013:NDS,
  author =       "Sergej Hardock and Ilia Petrov and Robert Gottstein and Alejandro Buchmann",
  title =        "{NoFTL}: database systems on {FTL}-less flash storage",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1278--1281",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The database architecture and workhorse algorithms have been designed to compensate for hard disk properties. The I/O characteristics of Flash memories have significant impact on database systems and many algorithms and approaches taking advantage of those have been proposed recently. Nonetheless on system level Flash storage devices are still treated as HDD compatible block devices, black boxes and fast HDD replacements. This backwards compatibility (both software and hardware) masks the native behaviour, incurs significant complexity and decreases I/O performance, making it non-robust and unpredictable. Database systems have a long tradition of operating directly on RAW storage natively, utilising the physical characteristics of storage media to improve performance. In this paper we demonstrate an approach called NoFTL that goes a step further. We show that allowing for native Flash access and integrating parts of the FTL functionality into the database system yields significant performance increase and simplification of the I/O stack. We created a real-time data-driven Flash emulator and integrated it accordingly into Shore-MT. We demonstrate a performance improvement of up to $ 3.7 \times $ compared to Shore-MT on RAW block-device Flash storage under various TPC workloads.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kotsakos:2013:SUS,
  author =       "Dimitrios Kotsakos and Panos Sakkos and Vana Kalogeraki and Dimitrios Gunopulos",
  title =        "{SmartMonitor}: using smart devices to perform structural health monitoring",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1282--1285",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In this demonstration, we are presenting SmartMonitor, a distributed Structural Health Monitoring (SHM) system consisting of smart devices. Over the last few years, the vast majority of smart devices is equipped with accelerometers that can be utilized towards building SHM systems with hundreds of nodes. We describe a scalable, fault-tolerant communication protocol, that performs best-effort time synchronization of the nodes and is used to implement a decentralized version of the popular peak-picking SHM method. The implemented interactive system can be easily installed in any accelerometer-equipped Android device and the user has a number of options for configuring the system or analyzing the collected data and computed outcomes.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kargin:2013:LEA,
  author =       "Ya{\u{g}}{\i}z Karg{\i}n and Milena Ivanova and Ying Zhang and Stefan Manegold and Martin Kersten",
  title =        "{Lazy ETL} in action: {ETL} technology dates scientific data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1286--1289",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Both scientific data and business data have analytical needs. Analysis takes place after a scientific data warehouse is eagerly filled with all data from external data sources (repositories). This is similar to the initial loading stage of Extract, Transform, and Load (ETL) processes that drive business intelligence. ETL can also help scientific data analysis. However, the initial loading is a time and resource consuming operation. It might not be entirely necessary, e.g. if the user is interested in only a subset of the data. We propose to demonstrate Lazy ETL, a technique to lower costs for initial loading. With it, ETL is integrated into the query processing of the scientific data warehouse. For a query, only the required data items are extracted, transformed, and loaded transparently on-the-fly. The demo is built around concrete implementations of Lazy ETL for seismic data analysis. The seismic data warehouse is ready for query processing, without waiting for long initial loading. The audience fires analytical queries to observe the internal mechanisms and modifications that realize each of the steps; lazy extraction, transformation, and loading.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dayan:2013:EED,
  author =       "Niv Dayan and Martin Kj{\ae}r Svendsen and Matias Bj{\o}rling and Philippe Bonnet and Luc Bouganim",
  title =        "{EagleTree}: exploring the design space of {SSD}-based algorithms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1290--1293",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Solid State Drives (SSDs) are a moving target for system designers: they are black boxes, their internals are undocumented, and their performance characteristics vary across models. There is no appropriate analytical model and experimenting with commercial SSDs is cumbersome, as it requires a careful experimental methodology to ensure repeatability. Worse, performance results obtained on a given SSD cannot be generalized. Overall, it is impossible to explore how a given algorithm, say a hash join or LSM-tree insertions, leverages the intrinsic parallelism of a modern SSD, or how a slight change in the internals of an SSD would impact its overall performance. In this paper, we propose a new SSD simulation framework, named EagleTree, which addresses these problems, and enables a principled study of SSD-Based algorithms. The demonstration scenario illustrates the design space for algorithms based on an SSD-based IO stack, and shows how researchers and practitioners can use EagleTree to perform tractable explorations of this complex design space.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sathe:2013:EPQ,
  author =       "Saket Sathe and Arthur Oviedo and Dipanjan Chakraborty and Karl Aberer",
  title =        "{EnviroMeter}: a platform for querying community-sensed data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1294--1297",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Efficiently querying data collected from Large-area Community driven Sensor Networks (LCSNs) is a new and challenging problem. In our previous works, we proposed adaptive techniques for learning models (e.g., statistical, nonparametric, etc.) from such data, considering the fact that LCSN data is typically geo-temporally skewed. In this paper, we present a demonstration of EnviroMeter. EnviroMeter uses our adaptive model creation techniques for processing continuous queries on community-sensed environmental pollution data. Subsequently, it efficiently pushes current pollution updates to GPS-enabled smartphones (through its Android application) or displays it via a web-interface. We experimentally demonstrate that our model-based query processing approach is orders of magnitude efficient than processing the queries over indexed raw data.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Okcan:2013:SEA,
  author =       "Alper Okcan and Mirek Riedewald and Biswanath Panda and Daniel Fink",
  title =        "{Scolopax}: exploratory analysis of scientific data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1298--1301",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The formulation of hypotheses based on patterns found in data is an essential component of scientific discovery. As larger and richer data sets become available, new scalable and user-friendly tools for scientific discovery through data analysis are needed. We demonstrate Scolopax, which explores the idea of a search engine for hypotheses. It has an intuitive user interface that supports sophisticated queries. Scolopax can explore a huge space of possible hypotheses, returning a ranked list of those that best match the user preferences. To scale to large and complex data sets, Scolopax relies on parallel data management and mining techniques. These include model training, efficient model summary generation, and novel parallel join techniques that together with traditional approaches such as clustering manipulate massive model-summary collections to find the most interesting hypotheses. This demonstration of Scolopax uses a real observational data set, provided by the Cornell Lab of Ornithology. It contains more than 3.3 million bird sightings reported by citizen scientists and has almost 2500 attributes.
Conference attendees have the opportunity to make novel discoveries in this data set, ranging from identifying variables that strongly affect bird populations in specific regions to detecting more sophisticated patterns such as habitat competition and migration.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2013:PPA,
  author =       "Daniel Deutch and Yuval Moskovitch and Val Tannen",
  title =        "{PROPOLIS}: provisioned analysis of data-centric processes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1302--1305",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We consider in this demonstration the (static) analysis of data-centric process-based applications, namely applications that depend on an underlying database and whose control is guided by a finite state transition system. We observe that analysts of such applications often want to do more than analyze a specific instance of the application's process control and database. In particular they want to interactively test and explore the effect on analysis results of different hypothetical modifications applied to the application's transition system and to the underlying database. To that end, we propose a demonstration of PROPOLIS, a system for PROvisioned PrOcess anaLysIS, namely analysis of data-centric processes under hypothetical modification scenarios. Our solution is based on the notion of a provisioned expression (which in turn is based on the notion of data provenance), namely an expression that captures, in a compact way, the analysis result with respect to all possible combinations of scenarios, and allows for their exploration at interactive speed.
We will demonstrate PROPOLIS in the context of an online shopping application, letting participants play the role of analysts.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Konda:2013:FSE,
  author =       "Pradap Konda and Arun Kumar and Christopher R{\'e} and Vaishnavi Sashikanth",
  title =        "Feature selection in enterprise analytics: a demonstration using an {R}-based data analytics system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "6",
  number =       "12",
  pages =        "1306--1309",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Dec 13 05:57:00 MST 2013",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Enterprise applications are analyzing ever larger amounts of data using advanced analytics techniques. Recent systems from Oracle, IBM, and SAP integrate R with a data processing system to support richer advanced analytics on large data. A key step in advanced analytics applications is feature selection, which is often an iterative process that involves statistical algorithms and data manipulations. From our conversations with data scientists and analysts at enterprise settings, we observe three key aspects about feature selection. First, feature selection is performed by many types of users, not just data scientists. Second, high performance is critical to perform feature selection processes on large data. Third, the provenance of the results and steps in feature selection processes needs to be tracked for purposes of transparency and auditability. Based on our discussions with data scientists and the literature on feature selection practice, we organize a set of operations for feature selection into the Columbus framework. We prototype Columbus as a library usable in the Oracle R Enterprise environment.
In this demonstration, we use Columbus to showcase how we can support various types of users of feature selection in one system. We then show how we optimize performance and manage the provenance of feature selection processes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Najafi:2013:FQP, author = "Mohammadreza Najafi and Mohammad Sadoghi and Hans-Arno Jacobsen", title = "Flexible query processor on {FPGAs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1310--1313", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we demonstrate Flexible Query Processor (FQP), an online reconfigurable event stream query processor. FQP is an FPGA-based query processor that supports select, project and join queries over event streams at line rate. While processing incoming events, FQP can accept new query expressions, a key distinguishing characteristic from related approaches employing FPGAs for acceleration. Our solution aims to address performance limitations experienced with general purpose processors needing to operate at line rate and lack of on the fly reconfigurability with custom designed hardware solutions on FPGAs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Civili:2013:MSM, author = "Cristina Civili and Marco Console and Giuseppe {De Giacomo} and Domenico Lembo and Maurizio Lenzerini and Lorenzo Lepore and Riccardo Mancini and Antonella Poggi and Riccardo Rosati and Marco Ruzzi and Valerio Santarelli and Domenico Fabio Savo", title = "{MASTRO STUDIO}: managing ontology-based data access applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1314--1317", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ontology-based data access (OBDA) is a novel paradigm for accessing large data repositories through an ontology, that is a formal description of a domain of interest. Supporting the management of OBDA applications poses new challenges, as it requires to provide effective tools for (i) allowing both expert and non-expert users to analyze the OBDA specification, (ii) collaboratively documenting the ontology, (iii) exploiting OBDA services, such as query answering and automated reasoning over ontologies, e.g., to support data quality check, and (iv) tuning the OBDA application towards optimized performances. To fulfill these challenges, we have built a novel system, called MASTRO STUDIO, based on a tool for automated reasoning over ontologies, enhanced with a suite of tools and optimization facilities for managing OBDA applications. To show the effectiveness of MASTRO STUDIO, we demonstrate its usage in one OBDA application developed in collaboration with the Italian Ministry of Economy and Finance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fuhry:2013:PHP, author = "David Fuhry and Yang Zhang and Venu Satuluri and Arnab Nandi and Srinivasan Parthasarathy", title = "{PLASMA-HD}: probing the lattice structure and makeup of high-dimensional data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1318--1321", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Rapidly making sense of, analyzing, and extracting useful information from large and complex data is a grand challenge. A user tasked with meeting this challenge is often befuddled with questions on where and how to begin to understand the relevant characteristics of such data. Real-world problem scenarios often involve scalability limitations and time constraints. In this paper we present an incremental interactive data analysis system as a step to address this challenge. This system builds on recent progress in the fields of interactive data exploration, locality sensitive hashing, knowledge caching, and graph visualization. Using visual clues based on rapid incremental estimates, a user is provided a multi-level capability to probe and interrogate the intrinsic structure of data. Throughout the interactive process, the output of previous probes can be used to construct increasingly tight coherence estimates across the parameter space, providing strong hints to the user about promising analysis steps to perform next. We present examples, interactive scenarios, and experimental results on several synthetic and real-world datasets which show the effectiveness and efficiency of our approach. 
The implications of this work are quite broad and can impact fields ranging from top-$k$ algorithms to data clustering and from manifold learning to similarity search.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Moyers:2013:DIP, author = "Matthew Moyers and Emad Soroush and Spencer C. Wallace and Simon Krughoff and Jake Vanderplas and Magdalena Balazinska and Andrew Connolly", title = "A demonstration of iterative parallel array processing in support of telescope image analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1322--1325", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we present AscotDB, a new tool for the analysis of telescope image data. AscotDB results from the integration of ASCOT, a Web-based tool for the collaborative analysis of telescope images and their metadata, and SciDB, a parallel array processing engine. We demonstrate the novel data exploration supported by this integrated tool on a 1 TB dataset comprising scientifically accurate, simulated telescope images. We also demonstrate novel iterative-processing features that we added to SciDB in order to support this use-case.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abdelhaq:2013:EOL, author = "Hamed Abdelhaq and Christian Sengstock and Michael Gertz", title = "{EvenTweet}: online localized event detection from {Twitter}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1326--1329", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Microblogging services such as Twitter, Facebook, and Foursquare have become major sources for information about real-world events. Most approaches that aim at extracting event information from such sources typically use the temporal context of messages. However, exploiting the location information of georeferenced messages, too, is important to detect localized events, such as public events or emergency situations. Users posting messages that are close to the location of an event serve as human sensors to describe an event. In this demonstration, we present a novel framework to detect localized events in real-time from a Twitter stream and to track the evolution of such events over time. For this, spatio-temporal characteristics of keywords are continuously extracted to identify meaningful candidates for event descriptions. Then, localized event information is extracted by clustering keywords according to their spatial similarity. To determine the most important events in a (recent) time frame, we introduce a scoring scheme for events. We demonstrate the functionality of our system, called EvenTweet, using a stream of tweets from Europe during the 2012 UEFA European Football Championship.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mousavi:2013:ITM, author = "Hamid Mousavi and Shi Gao and Carlo Zaniolo", title = "{IBminer}: a text mining tool for constructing and populating {InfoBox} databases and knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1330--1333", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge bases and structured summaries are playing a crucial role in many applications, such as text summarization, question answering, essay grading, and semantic search. Although, many systems (e.g., DBpedia and YaGo2) provide massive knowledge bases of such summaries, they all suffer from incompleteness, inconsistencies, and inaccuracies. These problems can be addressed and much improved by combining and integrating different knowledge bases, but their very large sizes and their reliance on different terminologies and ontologies make the task very difficult. In this demo, we will demonstrate a system that is achieving good success on this task by: (i) employing available interlinks in the current knowledge bases (e.g. external link and redirect links in DBpedia) to combine information on individual entities, and (ii) using widely available text corpora (e.g. Wikipedia) and our IBminer text-mining system, to generate and verify structured information, and reconcile terminologies across different knowledge bases. We will also demonstrate two tools designed to support the integration process in close collaboration with IBminer. 
The first is the InfoBox Knowledge-Base Browser (IBKB) which provides structured summaries and their provenance, and the second is the InfoBox Editor (IBE), which is designed to suggest relevant attributes for a user-specified subject, whereby the user can easily improve the knowledge base without requiring any knowledge about the internal terminology of individual systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Farnan:2013:PPA, author = "Nicholas L. Farnan and Adam J. Lee and Panos K. Chrysanthis and Ting Yu", title = "{PAQO}: a preference-aware query optimizer for {PostgreSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1334--1337", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Although the declarative nature of SQL provides great utility to database users, its use in distributed database management systems can leave users unaware of which servers in the system are evaluating portions of their queries. By allowing users to merely say what data they are interested in accessing without providing guidance regarding how to retrieve it, query optimizers can generate plans with unintended consequences to the user (e.g., violating user privacy by revealing sensitive portions of a user's query to untrusted servers, or impacting result freshness by pulling data from stale data stores). To address these types of issues, we have created a framework that empowers users with the ability to specify constraints on the kinds of plans that can be produced by the optimizer to evaluate their queries. Such constraints are specified through an extended version of SQL that we have developed which we call PASQL. 
With this proposal, we aim to demonstrate PAQO, a version of PostgreSQL's query optimizer that we have modified to produce plans that respect constraints specified through PASQL while optimizing user-specified SQL queries in terms of performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bothe:2013:EPS, author = "Suvarna Bothe and Panagiotis Karras and Akrivi Vlachou", title = "{eSkyline}: processing skyline queries over encrypted data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1338--1341", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The advent of cloud computing redefines the traditional query processing paradigm. Whereas computational overhead and memory constraints become less prohibitive, data privacy, security, and confidentiality concerns become top priorities. In particular, as data owners outsource the management of their data to service providers, query processing over such data has more resources to tap into, yet the data oftentimes has to be encrypted so as to prevent unauthorized access. The challenge that arises in such a setting is to devise an encryption scheme that still allows for query results to be efficiently computed using the encrypted data values. An important type of query that raises unconventional requirements in terms of the operator that has to be evaluated is the skyline query, which returns a set of objects in a dataset whose values are not dominated by any other object therein. 
In this demonstration, we present eSkyline, a prototype system and query interface that enables the processing of skyline queries over encrypted data, even without preserving the order on each attribute as order-preserving encryption would do. Our system comprises of an encryption scheme that facilitates the evaluation of domination relationships, hence allows for state-of-the-art skyline processing algorithms to be used. The actual data values are reconstructed only at the client side, where the encryption key is known. Our demo visualizes the details of the encryption scheme, allows a user to interact with a server, and showcases the efficiency of computing skyline queries and decrypting the results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2013:GMD, author = "Lilong Jiang and Michael Mandel and Arnab Nandi", title = "{GestureQuery}: a multitouch database query interface", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1342--1345", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multitouch interfaces allow users to directly and interactively manipulate data. We propose bringing such interactive manipulation to the task of querying SQL databases. This paper describes an initial implementation of such an interface for multitouch tablet devices called GestureQuery that translates multitouch gestures into database queries. It provides database users with immediate constructive feedback on their queries, allowing rapid iteration and refinement of those queries. 
Based on preliminary user studies, GestureQuery is easier to use, and lets users construct target queries quicker than console-based SQL and visual query builders while maintaining interactive performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2013:MLP, author = "Di Yang and Kaiyu Zhao and Maryam Hasan and Hanyuan Lu and Elke Rundensteiner and Matthew Ward", title = "Mining and linking patterns across live data streams and stream archives", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1346--1349", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We will demonstrate the visual analytics system $V$istream$^T$, that supports interactive mining of complex patterns within and across live data streams and stream pattern archives. Our system is equipped with both computational pattern mining and visualization techniques, which allow it to not only efficiently discover and manage patterns but also effectively convey the mining results to human analysts through visual displays. In our demonstration, we will illustrate that with $V$istream$^T$, analysts can easily submit, monitor and interact with a broad range of query types for pattern mining. This includes novel strategies for extracting complex patterns from streams in real time, summarizing neighbour-based patterns using multi-resolution compression strategies, selectively pushing patterns into the stream archive, validating the popularity or rarity of stream patterns by stream archive matching, and pattern evolution tracking to link patterns across time.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Samet:2013:PMQ, author = "Hanan Samet and Marco D. Adelfio and Brendan C. Fruin and Michael D. Lieberman and Jagan Sankaranarayanan", title = "{PhotoStand}: a map query interface for a database of news photos", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1350--1353", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "PhotoStand enables the use of a map query interface to retrieve news photos associated with news articles that are in turn associated with the principal locations that they mention collected as a result of monitoring the output of over 10,000 RSS news feeds, made available within minutes of publication, and stored in a PostgreSQL database. The news photos are ranked according to their relevance to the clusters of news articles associated with locations at which they are displayed. This work differs from traditional work in this field as the associated locations and topics (by virtue of the cluster with which the articles containing the news photos are associated) are generated automatically without any human intervention such as tagging, and that photos are retrieved by location instead of just by keyword as is the case for many existing systems. In addition, the clusters provide a filtering step for detecting near-duplicate news photos.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kumar:2013:HSH, author = "K. 
Ashwin Kumar and Jonathan Gluck and Amol Deshpande and Jimmy Lin", title = "{Hone}: {``Scaling} down'' {Hadoop} on shared-memory systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1354--1357", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The underlying assumption behind Hadoop and, more generally, the need for distributed processing is that the data to be analyzed cannot be held in memory on a single machine. Today, this assumption needs to be re-evaluated. Although petabyte-scale data-stores are increasingly common, it is unclear whether ``typical'' analytics tasks require more than a single high-end server. Additionally, we are seeing increased sophistication in analytics, e.g., machine learning, which generally operates over smaller and more refined datasets. To address these trends, we propose ``scaling down'' Hadoop to run on shared-memory machines. This paper presents a prototype runtime called Hone, intended to be both API and binary compatible with standard (distributed) Hadoop. That is, Hone can take an existing Hadoop jar and efficiently execute it, without modification, on a multi-core shared memory machine. This allows us to take existing Hadoop algorithms and find the most suitable run-time environment for execution on datasets of varying sizes. Our experiments show that Hone can be an order of magnitude faster than Hadoop pseudo-distributed mode (PDM); on dataset sizes that fit into memory, Hone can outperform a fully-distributed 15-node Hadoop cluster in some cases as well.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antenucci:2013:RGN, author = "Dolan Antenucci and Erdong Li and Shaobo Liu and Bochun Zhang and Michael J. Cafarella and Christopher R{\'e}", title = "{Ringtail}: a generalized nowcasting system", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1358--1361", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social media nowcasting---using online user activity to describe real-world phenomena---is an active area of research to supplement more traditional and costly data collection methods such as phone surveys. Given the potential impact of such research, we would expect general-purpose nowcasting systems to quickly become a standard tool among noncomputer scientists, yet it has largely remained a research topic. We believe a major obstacle to widespread adoption is the nowcasting feature selection problem. Typical nowcasting systems require the user to choose a handful of social media objects from a pool of billions of potential candidates, which can be a time-consuming and error-prone process. We have built RINGTAIL, a nowcasting system that helps the user by automatically suggesting high-quality signals. We demonstrate that RINGTAIL can make nowcasting easier by suggesting relevant features for a range of topics. The user provides just a short topic query (e.g., unemployment) and a small conventional dataset in order for RINGTAIL to quickly return a usable predictive nowcasting model.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2013:IIP, author = "Min Xie and Laks V. S. Lakshmanan and Peter T.
Wood", title = "{IPS}: an interactive package configuration system for trip planning", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1362--1365", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "When planning a trip, one essential task is to find a set of Places-of-Interest (POIs) which can be visited during the trip. Using existing travel guides or websites such as Lonely Planet and TripAdvisor, the user has to either manually work out a desirable set of POIs or take pre-configured travel packages; the former can be time consuming while the latter lacks flexibility. In this demonstration, we propose an Interactive Package configuration System (IPS), which visualizes different candidate packages on a map, and enables users to configure a travel package through simple interactions, i.e., comparing packages and fixing/removing POIs from a package. Compared with existing trip planning systems, we believe IPS strikes the right balance between flexibility and manual effort.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2013:RDS, author = "Jingbo Zhou and Anthony K. H. Tung and Wei Wu and Wee Siong Ng", title = "{R2-D2}: a system to support probabilistic path prediction in dynamic environments via {``Semi-lazy''} learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1366--1369", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Path prediction is presently an important area of research with a wide range of applications. 
However, most of the existing path prediction solutions are based on eager learning methods which commit to a model or a set of patterns extracted from historical trajectories. Such methods do not perform very well in dynamic environments where the objects' trajectories are affected by many irregular factors which are not captured by pre-defined models or patterns. In this demonstration, we present the ``R2-D2'' system that supports probabilistic path prediction in dynamic environments. The core of our system is a ``semi-lazy'' learning approach to probabilistic path prediction which builds a prediction model on the fly using historical trajectories that are selected dynamically based on the trajectories of target objects. Our ``R2-D2'' system has a visual interface that shows how our path prediction algorithm works on several real-world datasets. It also allows us to experiment with various parameter settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chun:2013:RRE, author = "Byung-Gon Chun and Tyson Condie and Carlo Curino and Chris Douglas and Sergiy Matusevych and Brandon Myers and Shravan Narayanamurthy and Raghu Ramakrishnan and Sriram Rao and Josh Rosen and Russell Sears and Markus Weimer", title = "{REEF}: retainable evaluator execution framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1370--1373", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demo proposal, we describe REEF, a framework that makes it easy to implement scalable, fault-tolerant runtime environments for a range of computational models. 
We will demonstrate diverse workloads, including extract-transform-load MapReduce jobs, iterative machine learning algorithms, and ad-hoc declarative query processing. At its core, REEF builds atop YARN (Apache Hadoop 2's resource manager) to provide retainable hardware resources with lifetimes that are decoupled from those of computational tasks. This allows us to build persistent (cross-job) caches and cluster-wide services, but, more importantly, supports high-performance iterative graph processing and machine learning algorithms. Unlike existing systems, REEF aims for composability of jobs across computational models, providing significant performance and usability gains, even with legacy code. REEF includes a library of interoperable data management primitives optimized for communication and data movement (which are distinct from storage locality). The library also allows REEF applications to access external services, such as user-facing relational databases. We were careful to decouple lower levels of REEF from the data models and semantics of systems built atop it. The result was two new standalone systems: Tang, a configuration manager and dependency injector, and Wake, a state-of-the-art event-driven programming and data movement framework. Both are language independent, allowing REEF to bridge the JVM and .NET.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2013:OTP, author = "Shuhao Zhang and Jiong He and Bingsheng He and Mian Lu", title = "{OmniDB}: towards portable and efficient query processing on parallel {CPU\slash GPU} architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1374--1377", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Driven by the rapid hardware development of parallel CPU/GPU architectures, we have witnessed emerging relational query processing techniques and implementations on those parallel architectures. However, most of those implementations are not portable across different architectures, because they are usually developed from scratch and target at a specific architecture. This paper proposes a kernel-adapter based design (OmniDB), a portable yet efficient query processor on parallel CPU/GPU architectures. OmniDB attempts to develop an extensible query processing kernel (qKernel) based on an abstract model for parallel architectures, and to leverage an architecture-specific layer (adapter) to make qKernel be aware of the target architecture. The goal of OmniDB is to maximize the common functionality in qKernel so that the development and maintenance efforts for adapters are minimized across different architectures. In this demo, we demonstrate our initial efforts in implementing OmniDB, and present the preliminary results on the portability and efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Savkovic:2013:CAI, author = "Ognjen Savkovi{\'c} and Paramita Mirza and Alex Tomasi and Werner Nutt", title = "Complete approximations of incomplete queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1378--1381", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a system that computes for a query that may be incomplete, complete approximations from above and from below. We assume a setting where queries are posed over a partially complete database, that is, a database that is generally incomplete, but is known to contain complete information about specific aspects of its application domain. Which parts are complete, is described by a set of so-called table-completeness statements. Previous work led to a theoretical framework and an implementation that allowed one to determine whether in such a scenario a given conjunctive query is guaranteed to return a complete set of answers or not. With the present demonstrator we show how to reformulate the original query in such a way that answers are guaranteed to be complete. If there exists a more general complete query, there is a unique most specific one, which we find. If there exists a more specific complete query, there may even be infinitely many. In this case, we find the least specific specializations whose size is bounded by a threshold provided by the user. Generalizations are computed by a fixpoint iteration, employing an answer set programming engine. Specializations are found leveraging unification from logic programming.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koutrika:2013:UAU, author = "Georgia Koutrika and Qian Lin and Jerry Liu", title = "User analytics with {UbeOne}: insights into web printing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1382--1385", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As web and mobile applications become more sensitive to the user context, there is a shift from purely off-line processing of user actions (log analysis) to real-time user analytics that can generate information about the user context to be instantly leveraged by the application. UbeOne is a system that enables both real-time and aggregate analytics from user data. The system is designed as a set of lightweight, composable mechanisms that can progressively and collectively analyze a user action, such as pinning, saving or printing a web page. We will demonstrate the system capabilities on analyzing a live feed of URLs printed through a proprietary, web browser plug-in. This is in fact the first analysis of web printing activity. We will also give a taste of how the system can enable instant recommendations based on the user context.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Santos:2013:DDS, author = "Ivo Santos and Marcel Tilly and Badrish Chandramouli and Jonathan Goldstein", title = "{DiAl}: distributed streaming analytics anywhere, anytime", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1386--1389", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Connected devices are expected to grow to 50 billion in 2020. Through our industrial partners and their use cases, we validated the importance of inflight data processing to produce results with low latency, in particular local and global data analytics capabilities. In order to cope with the scalability challenges posed by distributed streaming analytics scenarios, we propose two new technologies: (1) JStreams, a low footprint and efficient JavaScript complex event processing engine supporting local analytics on heterogeneous devices and (2) DiAlM, a distributed analytics management service that leverages cloud-edge evolving topologies. In the demonstration, based on a real manufacturing use case, we walk through a situation where operators supervise manufacturing equipment through global analytics, and drill down into alarm cases on the factory floor by locally inspecting the data generated by the manufacturing equipment.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chirkova:2013:BUW, author = "Rada Chirkova and Jun Yang", title = "Big and useful: what's in the data for me?", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1390--1391", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bartos:2013:UIA, author = "Tom{\'a}{\v{s}} Barto{\v{s}}", title = "Universal indexing of arbitrary similarity models", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1392--1397", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The increasing amount of available unstructured content together with the growing number of large nonrelational databases put more emphasis on the content-based retrieval and precisely on the area of similarity searching. Although there exist several indexing methods for efficient querying, not all of them are best-suited for arbitrary similarity models. Having a metric space, we can easily apply metric access methods but for nonmetric models which typically better describe similarities between generally unstructured objects the situation is a little bit more complicated. To address this challenge, we introduce SIMDEX, the universal framework that is capable of finding alternative indexing methods that will serve for efficient yet effective similarity searching for any similarity model. 
Using trivial or more advanced methods for the incremental exploration of possible indexing techniques, we are able to find alternative methods to the widely used metric space model paradigm. Through experimental evaluations, we validate our approach and show how it outperforms the known indexing methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bress:2013:WIT, author = "Sebastian Bre{\ss} and Gunter Saake", title = "Why it is time for a {HyPE}: a hybrid query processing engine for efficient {GPU} coprocessing in {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1398--1403", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "GPU acceleration is a promising approach to speed up query processing of database systems by using low cost graphic processors as coprocessors. Two major trends have emerged in this area: (1) The development of frameworks for scheduling tasks in heterogeneous CPU/GPU platforms, which is mainly in the context of coprocessing for applications and does not consider specifics of database-query processing and optimization. (2) The acceleration of database operations using efficient GPU algorithms, which typically cannot be applied easily on other database systems, because of their analytical-algorithm-specific cost models. One major challenge is how to combine traditional database query processing with GPU coprocessing techniques and efficient database operation scheduling in a GPU-aware query optimizer. 
In this thesis, we develop a hybrid query processing engine, which extends the traditional physical optimization process to generate hybrid query plans and to perform a cost-based optimization in a way that the advantages of CPUs and GPUs are combined. Furthermore, we aim at a portable solution between different GPU-accelerated database management systems to maximize applicability. Preliminary results indicate great potential.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mahdiraji:2013:DSU, author = "Alireza Rezaei Mahdiraji and Peter Baumann", title = "Database support for unstructured meshes", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1404--1409", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite ubiquitous usage of unstructured mesh in many application domains (e.g., computer aided design, scientific simulation, climate modeling, etc.), there is no specialized mesh database which supports storing and querying such data structures. Existing mesh libraries use file-based APIs which do not support declarative querying and are difficult to maintain. A mesh database can benefit these domains in several ways such as: declarative query language, ease of maintenance, query optimization, etc. In this thesis work, the core idea is to have a very general model which can represent objects from different domains and specialize it to smaller object classes using combinatorial constraints. We propose the Incidence multi-Graph Complex (ImG-Complex) data model for storing combinatorial aspect of meshes in a database. 
We extend incidence graph (IG) representation with multi-incidence information (ImG) to represent a class of objects which we call ImG-Complexes. ImG-Complex can support a wide range of application domains. We introduce optional and application-specific constraints to restrain the general ImG model to specific object classes or specific geometric representations. The constraints check validity of meshes based on the properties of the modeled object class. Finally, we show how graph databases can be utilized and reused to query some combinatorial mesh queries based on the (possibly constrained) ImG model. In particular, we show the strengths and limitations of a graph-only query language in expressing combinatorial mesh queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Madaan:2013:DSM, author = "Aastha Madaan and Subhash Bhalla", title = "Domain specific multistage query language for medical document repositories", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1410--1415", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vast amount of medical information is increasingly available on the Web. As a result, seeking medical information through queries is gaining importance in the medical domain. The existing keyword-based search engines such as Google, Yahoo fail to suffice the needs of the health-care workers (who are well-versed with the domain knowledge required for querying) using these they often face results which are irrelevant and not useful for their tasks. 
In this paper, we present the need and the challenges for a user-level, domain-specific query language for the specialized document repositories of the medical domain. This topic has not been sufficiently addressed by the existing approaches including SQL-like query languages or general-purpose keyword-based search engines and document-level indexing based search. We aim to bridge the gap between information needs of the skilled/semi-skilled domain users and the query capability provided by the query language. Overcoming such a challenge can facilitate effective use of large volume of information on the Web (and in the electronic health records (EHRs) repositories).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Taxidou:2013:RAI, author = "Io Taxidou and Peter Fischer", title = "Realtime analysis of information diffusion in social media", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1416--1421", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The goal of this thesis is to investigate real-time analysis methods on social media with a focus on information diffusion. From a conceptual point of view, we are interested both in the structural, sociological and temporal aspects of information diffusion in social media with a twist on the real time factor of what is happening right now. From a technical side, the sheer size of current social media services (100's of millions of users) and the large amount of data produced by these users renders conventional approaches for these costly analyses impossible. For that, we need to go beyond the state-of-the-art infrastructure for data-intensive computation. 
Our high level goal is to investigate how information diffuses in real time on the underlying social network and the role of different users in the propagation process. We plan to implement these analyses with full and partially missing datasets and compare the cost and quality of both approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bonomi:2013:MFP, author = "Luca Bonomi and Li Xiong", title = "Mining frequent patterns with differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1422--1427", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The mining of frequent patterns is a fundamental component in many data mining tasks. A considerable amount of research on this problem has led to a wide series of efficient and scalable algorithms for mining frequent patterns. However, releasing these patterns is posing concerns on the privacy of the users participating in the data. Indeed the information from the patterns can be linked with a large amount of data available from other sources creating opportunities for adversaries to break the individual privacy of the users and disclose sensitive information. In this proposal, we study the mining of frequent patterns in a privacy preserving setting. We first investigate the difference between sequential and itemset patterns, and second we extend the definition of patterns by considering the absence and presence of noise in the data. This leads us in distinguishing the patterns between exact and noisy. For exact patterns, we describe two novel mining techniques that we previously developed. 
The first approach has been applied in a privacy preserving record linkage setting, where our solution is used to mine frequent patterns which are employed in a secure transformation procedure to link records that are similar. The second approach improves the mining utility results using a two-phase strategy which allows to effectively mine frequent substrings as well as prefixes patterns. For noisy patterns, first we formally define the patterns according to the type of noise and second we provide a set of potential applications that require the mining of these patterns. We conclude the paper by stating the challenges in this new setting and possible future research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hoppe:2013:AOB, author = "Anett Hoppe and C. Nicolle and A. Roxin", title = "Automatic ontology-based user profile learning from heterogeneous {Web} resources in a big data context", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1428--1433", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Web has developed to the biggest source of information and entertainment in the world. By its size, its adaptability and flexibility, it challenged our current paradigms on information sharing in several areas. By offering everybody the opportunity to release own contents in a fast and cheap way, the Web already led to a revolution of the traditional publishing world and just now, it commences to change the perspective on advertisements. 
With the possibility to adapt the contents displayed on a page dynamically based on the viewer's context, campaigns launched to target rough customer groups will become an element of the past. However, this new ecosystem, that relates advertisements with the user, heavily relies on the quality of the underlying user profile. This profile has to be able to model any combination of user characteristics, the relations between its composing elements and the uncertainty that stems from the automated processing of real-world data. The work at hand describes the beginnings of a PhD project that aims to tackle those issues using a combination of data analysis, ontology engineering and processing of big data resources provided by an industrial partner. The final goal is to automatically construct and populate a profile ontology for each user identified by the system. This allows to associate these users to high-value audience segments in order to drive digital marketing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dey:2013:STA, author = "Akon Dey and Alan Fekete and Uwe R{\"o}hm", title = "Scalable transactions across heterogeneous {NoSQL} key--value data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1434--1439", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many cloud systems provide data stores with limited features, especially they may not provide transactions, or else restrict transactions to a single item. 
We propose an approach that gives multi-item transactions across heterogeneous data stores, using only a minimal set of features from each store such as single item consistency, conditional update, and the ability to include extra metadata within a value. We offer a client-coordinated transaction protocol that does not need a central coordinating infrastructure. A prototype implementation has been built as a Java library and measured with an extension of YCSB benchmark to exercise multi-item transactions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ngo:2013:GUS, author = "Nhung Ngo and Enrico Franconi", title = "Getting unique solution in data exchange", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1440--1443", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A schema mapping is a high-level specification in which the relationship between two database schemas is described. In data exchange, schema mappings are one-way mappings that describe which data can be brought from source data to target data. Therefore, given a source instance and a mapping, there might be more than one valid target instance. This fact causes many problems in query answering over target data for non-conjunctive queries. To make query answering feasible for all queries, we focus on a methodology for extending the original schema mapping to guarantee the uniqueness of target instance corresponding to a source instance. To this end, we introduce a theoretical framework where the problem is transformed to an abduction problem, namely, definability abduction. 
We apply the framework to relational data exchange setting and solve the problem by pointing out minimal solutions according to a specific semantic minimality criterion.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaufmann:2013:SPT, author = "Martin Kaufmann and Donald Kossmann", title = "Storing and processing temporal data in a main memory column store", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1444--1449", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Managing and accessing temporal data is of increasing importance in industry. So far, most companies model the time dimension on the application layer rather than pushing down the operators to the database, which leads to a significant performance overhead. The goal of this PhD thesis is to develop a native support of temporal features for SAP HANA, which is a commercial in-memory column store database system. We investigate different alternatives to store temporal data physically and analyze the trade-offs arising from different memory layouts which cluster the data either by time or by space dimension. Taking into account the underlying physical representation, different temporal operators such as temporal aggregation, time travel and temporal join have to be executed efficiently. We present a novel data structure called Timeline Index and algorithms based on this index, which have a very competitive performance for all temporal operators beating existing best-of-breed approaches by factors, sometimes even by orders of magnitude. 
The results of this thesis are currently being integrated into HANA, with the goal of being shipped to the customers as a productive release within the next few months.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kozak:2013:ESS, author = "Stepan Kozak and Pavel Zezula", title = "Efficiency and security in similarity cloud services", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1450--1455", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With growing popularity of cloud services, the trend in the industry is to outsource the data to a 3rd party system that provides searching in the data as a service. This approach naturally brings privacy concerns about the (potentially sensitive) data. Recently, quite extensive research of outsourcing classic exact-match or keyword search has been done. However, not much attention has been paid to the outsourcing of the similarity search, which becomes more and more important in information retrieval applications. In this work, we propose to the research community a model of outsourcing similarity search to the cloud environment (so called similarity cloud). We establish privacy and efficiency requirements to be laid down for the similarity cloud with an emphasis on practical use of the system in real applications; this requirement list can be used as a general guideline for practical system analysis and we use it to analyze current existing approaches. We propose two new similarity indexes that ensure data privacy and thus are suitable for search systems outsourced in a cloud. 
The balance of the first proposed technique EM-Index is more on the efficiency side while the other (DSH Index) shifts this balance more to the privacy side.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sellam:2013:FCD, author = "Thibault Sellam and Martin Kersten", title = "Fast cartography for data explorers", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "12", pages = "1456--1461", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:00 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Exploration is the act of investigating unknown regions. An analyst exploring a database cannot, by definition, compose the right query or use the appropriate data mining algorithm. However, current data management tools cannot operate without well defined instructions. Therefore, browsing an unknown database can be a very tedious process. Our project, Atlas, is an attempt to circumvent this problem. Atlas is an active DBMS front-end, designed for database exploration. It generates and ranks several data maps from a user query. A data map is a small set of database queries (less than a dozen), in which each query describes an interesting region of the database. The user can pick one and submit it for further exploration. In order to support interaction, the system should operate in quasi-real time, possibly at the cost of precision, and require as little input parameters as possible. We draft a framework to generate such data maps, and introduce several short- to long-term research problems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Simoes:2013:WSP, author = "Gon{\c{c}}alo Sim{\~o}es and Helena Galhardas and Luis Gravano", title = "When speed has a price: fast information extraction using approximate algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1462--1473", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A wealth of information produced by individuals and organizations is expressed in natural language text. This is a problem since text lacks the explicit structure that is necessary to support rich querying and analysis. Information extraction systems are sophisticated software tools to discover structured information in natural language text. Unfortunately, information extraction is a challenging and time-consuming task. In this paper, we address the limitations of state-of-the-art systems for the optimization of information extraction programs, with the objective of producing efficient extraction executions. Our solution relies on exploiting a wide range of optimization opportunities. For efficiency, we consider a wide spectrum of execution plans, including approximate plans whose results differ in their precision and recall. Our optimizer accounts for these characteristics of the competing execution plans, and uses accurate predictors of their extraction time, recall, and precision. We demonstrate the efficiency and effectiveness of our optimizer through a large-scale experimental evaluation over real-world datasets and multiple extraction tasks and approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chasseur:2013:DES, author = "Craig Chasseur and Jignesh M. Patel", title = "Design and evaluation of storage organizations for read-optimized main memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1474--1485", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing main memory data processing systems employ a variety of storage organizations and make a number of storage-related design choices. The focus of this paper is on systematically evaluating a number of these key storage design choices for main memory analytical (i.e. read-optimized) database settings. Our evaluation produces a number of key insights: First, it is always beneficial to organize data into self-contained memory blocks rather than large files. Second, both column-stores and row-stores display performance advantages for different types of queries, and for high performance both should be implemented as options for the tuple-storage layout. Third, cache-sensitive B+-tree indices can play a major role in accelerating query performance, especially when used in a block-oriented organization. Finally, compression can also play a role in accelerating query performance depending on data distribution and query selectivity.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2013:ASA, author = "Luying Chen and Stefano Ortona and Giorgio Orsi and Michael Benedikt", title = "Aggregating semantic annotators", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1486--1497", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A growing number of resources are available for enriching documents with semantic annotations. While originally focused on a few standard classes of annotations, the ecosystem of annotators is now becoming increasingly diverse. Although annotators often have very different vocabularies, with both high-level and specialist concepts, they also have many semantic interconnections. We will show that both the overlap and the diversity in annotator vocabularies motivate the need for semantic annotation integration: middleware that produces a unified annotation on top of diverse semantic annotators. On the one hand, the diversity of vocabulary allows applications to benefit from the much richer vocabulary available in an integrated vocabulary. On the other hand, we present evidence that the most widely-used annotators on the web suffer from serious accuracy deficiencies: the overlap in vocabularies from individual annotators allows an integrated annotator to boost accuracy by exploiting inter-annotator agreement and disagreement. The integration of semantic annotations leads to new challenges, both compared to usual data integration scenarios and to standard aggregation of machine learning tools. We overview an approach to these challenges that performs ontology-aware aggregation. We introduce an approach that requires no training data, making use of ideas from database repair. 
We experimentally compare this with a supervised approach, which adapts maximal entropy Markov models to the setting of ontology-based annotations. We further experimentally compare both these approaches with respect to ontology-unaware supervised approaches, and to individual annotators.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2013:DDC, author = "Xu Chu and Ihab F. Ilyas and Paolo Papotti", title = "Discovering denial constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1498--1509", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Integrity constraints (ICs) provide a valuable tool for enforcing correct application semantics. However, designing ICs requires experts and time. Proposals for automatic discovery have been made for some formalisms, such as functional dependencies and their extension conditional functional dependencies. Unfortunately, these dependencies cannot express many common business rules. For example, an American citizen cannot have lower salary and higher tax rate than another citizen in the same state. In this paper, we tackle the challenges of discovering dependencies in a more expressive integrity constraint language, namely Denial Constraints (DCs). DCs are expressive enough to overcome the limits of previous languages and, at the same time, have enough structure to allow efficient discovery and application in several scenarios. We lay out theoretical and practical foundations for DCs, including a set of sound inference rules and a linear algorithm for implication testing. 
We then develop an efficient instance-driven DC discovery algorithm and propose a novel scoring function to rank DCs for user validation. Using real-world and synthetic datasets, we experimentally evaluate scalability and effectiveness of our solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2013:DTK, author = "Wenfei Fan and Xin Wang and Yinghui Wu", title = "Diversified top-$k$ graph pattern matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1510--1521", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph pattern matching has been widely used in e.g., social data analysis. A number of matching algorithms have been developed that, given a graph pattern $Q$ and a graph $G$, compute the set $ M(Q, G) $ of matches of $Q$ in $G$. However, these algorithms often return an excessive number of matches, and are expensive on large real-life social graphs. Moreover, in practice many social queries are to find matches of a specific pattern node, rather than the entire $ M(Q, G) $. This paper studies top-$k$ graph pattern matching. (1) We revise graph pattern matching defined in terms of simulation, by supporting a designated output node $ u_o $. Given $G$ and $Q$, it is to find those nodes in $ M(Q, G) $ that match $ u_o $, instead of the large set $ M(Q, G) $. (2) We study two classes of functions for ranking the matches: relevance functions $ \delta_r() $ based on, e.g., social impact, and distance functions $ \delta_d() $ to cover diverse elements.
(3) We develop two algorithms for computing top-$k$ matches of $ u_o $ based on $ \delta_r() $, with the early termination property, i.e., they find top-$k$ matches without computing the entire $ M(Q, G) $. (4) We also study diversified top-$k$ matching, a bi-criteria optimization problem based on both $ \delta_r() $ and $ \delta_d() $. We show that its decision problem is NP-complete. Nonetheless, we provide an approximation algorithm with performance guarantees and a heuristic one with the early termination property. (5) Using real-life and synthetic data, we experimentally verify that our (diversified) top-$k$ matching algorithms are effective, and outperform traditional matching algorithms in efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rao:2013:BNF, author = "Weixiong Rao and Lei Chen and Pan Hui and Sasu Tarkoma", title = "{Bitlist}: new full-text index for low space cost and efficient keyword search", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1522--1533", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays Web search engines are experiencing significant performance challenges caused by a huge amount of Web pages and increasingly larger number of Web users. The key issue for addressing these challenges is to design a compact structure which can index Web documents with low space and meanwhile process keyword search very fast. Unfortunately, the current solutions typically separate the space optimization from the search improvement. As a result, such solutions either save space yet with search inefficiency, or allow fast keyword search but with huge space requirement.
In this paper, to address the challenges, we propose a novel structure bitlist with both low space requirement and supporting fast keyword search. Specifically, based on a simple and yet very efficient encoding scheme, bitlist uses a single number to encode a set of integer document IDs for low space, and adopts fast bitwise operations for very efficient boolean-based keyword search. Our extensive experimental results on real and synthetic data sets verify that bitlist outperforms the recent proposed solution, inverted list compression [23, 22] by spending 36.71\% less space and 61.91\% faster processing time, and achieves comparable running time as [8] but with significantly lower space.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wandelt:2013:RSS, author = "Sebastian Wandelt and Johannes Starlinger and Marc Bux and Ulf Leser", title = "{RCSI}: scalable similarity search in thousand(s) of genomes", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1534--1545", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Until recently, genomics has concentrated on comparing sequences between species. However, due to the sharply falling cost of sequencing technology, studies of populations of individuals of the same species are now feasible and promise advances in areas such as personalized medicine and treatment of genetic diseases. A core operation in such studies is read mapping, i.e., finding all parts of a set of genomes which are within edit distance $k$ to a given query sequence ($k$-approximate search). 
To achieve sufficient speed, current algorithms solve this problem only for one to-be-searched genome and compute only approximate solutions, i.e., they miss some $k$-approximate occurrences. We present RCSI, Referentially Compressed Search Index, which scales to a thousand genomes and computes the exact answer. It exploits the fact that genomes of different individuals of the same species are highly similar by first compressing the to-be-searched genomes with respect to a reference genome. Given a query, RCSI then searches the reference and all genome-specific individual differences. We propose efficient data structures for representing compressed genomes and present algorithms for scalable compression and similarity search. We evaluate our algorithms on a set of 1092 human genomes, which amount to approx. 3 TB of raw data. RCSI compresses this set by a ratio of 450:1 (26:1 including the search index) and answers similarity queries on a mid-class server in 15 ms on average even for comparably large error thresholds, thereby significantly outperforming other methods. Furthermore, we present a fast and adaptive heuristic for choosing the best reference sequence for referential compression, a problem that was never studied before at this scale.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tao:2013:AMS, author = "Yufei Tao and Xiaocheng Hu and Dong-Wan Choi and Chin-Wan Chung", title = "Approximate {MaxRS} in spatial databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1546--1557", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the maximizing range sum (MaxRS) problem, given (i) a set $P$ of $2$D points each of which is associated with a positive weight, and (ii) a rectangle $r$ of specific extents, we need to decide where to place $r$ in order to maximize the covered weight of $r$ --- that is, the total weight of the data points covered by $r$. Algorithms solving the problem exactly entail expensive CPU or I/O cost. In practice, exact answers are often not compulsory in a MaxRS application, where slight imprecision can often be comfortably tolerated, provided that approximate answers can be computed considerably faster. Motivated by this, the present paper studies the $ (1 - \epsilon) $-approximate MaxRS problem, which admits the same inputs as MaxRS, but aims instead to return a rectangle whose covered weight is at least $ (1 - \epsilon) m^* $, where $ m^* $ is the optimal covered weight, and $ \epsilon $ can be an arbitrarily small constant between $0$ and $1$. We present fast algorithms that settle this problem with strong theoretical guarantees.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kimelfeld:2013:MTD, author = "Benny Kimelfeld and Jan Vondr{\'a}k and David P. 
Woodruff", title = "Multi-tuple deletion propagation: approximations and complexity", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1558--1569", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies the computational complexity of the classic problem of deletion propagation in a relational database, where tuples are deleted from the base relations in order to realize a desired deletion of tuples from the view. Such an operation may result in a (sometimes unavoidable) side effect: deletion of additional tuples from the view, besides the intentionally deleted ones. The goal is to minimize the side effect. The complexity of this problem has been well studied in the case where only a single tuple is deleted from the view. However, only little is known within the more realistic scenario of multi-tuple deletion, which is the topic of this paper. The class of conjunctive queries (CQs) is among the most well studied in the literature, and we focus here on views defined by CQs that are self-join free (sjf-CQs). Our main result is a trichotomy in complexity, classifying all sjf-CQs into three categories: those for which the problem is in polynomial time, those for which the problem is NP-hard but polynomial-time approximable (by a constant-factor), and those for which even an approximation (by any factor) is NP-hard to obtain. A corollary of this trichotomy is a dichotomy in the complexity of deciding whether a side-effect-free solution exists, in the multi-tuple case. We further extend the full classification to accommodate the presence of a constant upper bound on the number of view tuples to delete, and the presence of functional dependencies. 
Finally, we establish (positive and negative) complexity results on approximability for the dual problem of maximizing the number of view tuples surviving (rather than minimizing the side effect incurred in) the deletion propagation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandramouli:2013:SDF, author = "Badrish Chandramouli and Suman Nath and Wenchao Zhou", title = "Supporting distributed feed-following apps over edge devices", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1570--1581", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In feed-following applications such as Twitter and Facebook, users (consumers) follow a large number of other users (producers) to get personalized feeds, generated by blending producers' feeds. With the proliferation of Cloud-connected smart edge devices such as smartphones, producers and consumers of many feed-following applications reside on edge devices and the Cloud. An important design goal of such applications is to minimize communication (and energy) overhead of edge devices. In this paper, we abstract distributed feed-following applications as a view maintenance problem, with the goal of optimally placing the views on edge devices and in the Cloud to minimize communication overhead between edge devices and the Cloud. The view placement problem for general network topology is NP Hard; however, we show that for the special case of Cloud-edge topology, locally optimal solutions yield a globally optimal view placement solution. Based on this powerful result, we propose view placement algorithms that are highly efficient, yet provably minimize global network cost.
Compared to existing works on feed-following applications, our algorithms are more general --- they support views with selection, projection, correlation (join) and arbitrary black-box operators, and can even refer to other views. We have implemented our algorithms within a distributed feed-following architecture over real smartphones and the Cloud. Experiments over real datasets indicate that our algorithms are highly scalable and orders-of-magnitude more efficient than existing strategies for optimal placement. Further, our results show that optimal placements generated by our algorithms are often several factors better than simpler schemes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Thirumuruganathan:2013:RDW, author = "Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Rank discovery from web databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1582--1593", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many web databases are only accessible through a proprietary search interface which allows users to form a query by entering the desired values for a few attributes. After receiving a query, the system returns the top-$k$ matching tuples according to a pre-determined ranking function. Since the rank of a tuple largely determines the attention it receives from website users, ranking information for any tuple --- not just the top-ranked ones --- is often of significant interest to third parties such as sellers, customers, market researchers and investors. In this paper, we define a novel problem of rank discovery over hidden web databases.
We introduce a taxonomy of ranking functions, and show that different types of ranking functions require fundamentally different approaches for rank discovery. Our technical contributions include principled and efficient randomized algorithms for estimating the rank of a given tuple, as well as negative results which demonstrate the inefficiency of any deterministic algorithm. We show extensive experimental results over real-world databases, including an online experiment at Amazon.com, which illustrates the effectiveness of our proposed techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rekatsinas:2013:SPS, author = "Theodoros Rekatsinas and Amol Deshpande and Ashwin Machanavajjhala", title = "{SPARSI}: partitioning sensitive data amongst multiple adversaries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1594--1605", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present SPARSI, a novel theoretical framework for partitioning sensitive data across multiple non-colluding adversaries. Most work in privacy-aware data sharing has considered disclosing summaries where the aggregate information about the data is preserved, but sensitive user information is protected. Nonetheless, there are applications, including online advertising, cloud computing and crowdsourcing markets, where detailed and fine-grained user data must be disclosed. We consider a new data sharing paradigm and introduce the problem of privacy-aware data partitioning, where a sensitive dataset must be partitioned among $k$ untrusted parties (adversaries). 
The goal is to maximize the utility derived by partitioning and distributing the dataset, while minimizing the total amount of sensitive information disclosed. The data should be distributed so that an adversary, without colluding with other adversaries, cannot draw additional inferences about the private information, by linking together multiple pieces of information released to her. The assumption of no collusion is both reasonable and necessary in the above application domains that require release of private user information. SPARSI enables us to formally define privacy-aware data partitioning using the notion of sensitive properties for modeling private information and a hypergraph representation for describing the interdependencies between data entries and private information. We show that solving privacy-aware partitioning is, in general, NP-hard, but for specific information disclosure functions, good approximate solutions can be found using relaxation techniques. Finally, we present a local search algorithm applicable to generic information disclosure functions. We conduct a rigorous performance evaluation with real-world and synthetic datasets that illustrates the effectiveness of SPARSI at partitioning sensitive data while minimizing disclosure.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deng:2013:SCC, author = "Dong Deng and Yu Jiang and Guoliang Li and Jian Li and Cong Yu", title = "Scalable column concept determination for {Web} tables using large knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1606--1617", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tabular data on the Web has become a rich source of structured data that is useful for ordinary users to explore. Due to its potential, tables on the Web have recently attracted a number of studies with the goals of understanding the semantics of those Web tables and providing effective search and exploration mechanisms over them. An important part of table understanding and search is column concept determination, i.e., identifying the most appropriate concepts associated with the columns of the tables. The problem becomes especially challenging with the availability of increasingly rich knowledge bases that contain hundreds of millions of entities. In this paper, we focus on an important instantiation of the column concept determination problem, namely, the concepts of a column are determined by fuzzy matching its cell values to the entities within a large knowledge base. We provide an efficient and scalable MapReduce-based solution that is scalable to both the number of tables and the size of the knowledge base and propose two novel techniques: knowledge concept aggregation and knowledge entity partition. We prove that both the problem of finding the optimal aggregation strategy and that of finding the optimal partition strategy are NP-hard, and propose efficient heuristic techniques by leveraging the hierarchy of the knowledge base. 
Experimental results on real-world datasets show that our method achieves high annotation quality and performance, and scales well.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2013:TKS, author = "Xin Huang and Hong Cheng and Rong-Hua Li and Lu Qin and Jeffrey Xu Yu", title = "Top-$k$ structural diversity search in large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1618--1629", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social contagion depicts a process of information (e.g., fads, opinions, news) diffusion in the online social networks. A recent study reports that in a social contagion process the probability of contagion is tightly controlled by the number of connected components in an individual's neighborhood. Such a number is termed structural diversity of an individual and it is shown to be a key predictor in the social contagion process. Based on this, a fundamental issue in a social network is to find top-$k$ users with the highest structural diversities. In this paper, we, for the first time, study the top-$k$ structural diversity search problem in a large network. Specifically, we develop an effective upper bound of structural diversity for pruning the search space. The upper bound can be incrementally refined in the search process. Based on such upper bound, we propose an efficient framework for top-$k$ structural diversity search. To further speed up the structural diversity evaluation in the search process, several carefully devised heuristic search strategies are proposed.
Extensive experimental studies are conducted in 13 real-world large networks, and the results demonstrate the efficiency and effectiveness of the proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cavalieri:2013:SCX, author = "Federico Cavalieri and Alessandro Solimando and Giovanna Guerrini", title = "Synthetising changes in {XML} documents as {PULs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "13", pages = "1630--1641", month = aug, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:09 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability of efficiently detecting changes in XML documents is crucial in many application contexts. If such changes are represented as XQuery Update Pending Update Lists (PULs), they can then be applied on documents using XQuery Update engines, and document management can take advantage of existing composition, inversion, reconciliation approaches developed in the update processing context. The paper presents an XML edit-script generator with the unique characteristic of using PULs as edit-script language and improving the state of the art from both the performance and the generated edit-script quality perspectives.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2013:PQR, author = "Lei Zhang and Thanh Tran and Achim Rettinger", title = "Probabilistic query rewriting for efficient and effective keyword search on graph data", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1642--1653", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of rewriting keyword search queries on graph data has been studied recently, where the main goal is to clean user queries by rewriting keywords as valid tokens appearing in the data and grouping them into meaningful segments. The main solution to this problem employs heuristics for ranking query rewrites and a dynamic programming algorithm for computing them. Based on a broader set of queries defined by an existing benchmark, we show that the use of these heuristics does not yield good results. We propose a novel probabilistic framework, which enables the optimality of a query rewrite to be estimated in a more principled way. We show that our approach outperforms existing work in terms of effectiveness and efficiency of query rewriting. More importantly, we provide the first results indicating query rewriting can indeed improve overall keyword search runtime performance and result quality.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schaler:2013:QBH, author = "Martin Sch{\"a}ler and Alexander Grebhahn and Reimar Schr{\"o}ter and Sandro Schulze and Veit K{\"o}ppen and Gunter Saake", title = "{QuEval}: beyond high-dimensional indexing {\`a} la carte", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1654--1665", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the recent past, the amount of high-dimensional data, such as feature vectors extracted from multimedia data, increased dramatically. A large variety of indexes have been proposed to store and access such data efficiently. However, due to specific requirements of a certain use case, choosing an adequate index structure is a complex and time-consuming task. This may be due to engineering challenges or open research questions. To overcome this limitation, we present QuEval, an open-source framework that can be flexibly extended w.r.t. index structures, distance metrics, and data sets. QuEval provides a unified environment for a sound evaluation of different indexes, for instance, to support tuning of indexes. In an empirical evaluation, we show how to apply our framework, motivate benefits, and demonstrate analysis possibilities.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2013:DLL, author = "Yuhong Li and Leong Hou U
and Man Lung Yiu and Zhiguo Gong", title = "Discovering longest-lasting correlation in sequence databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1666--1677", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most existing work on sequence databases use correlation (e.g., Euclidean distance and Pearson correlation) as a core function for various analytical tasks. Typically, it requires users to set a length for the similarity queries. However, there is no steady way to define the proper length on different application needs. In this work we focus on discovering longest-lasting highly correlated subsequences in sequence databases, which is particularly useful in helping those analyses without prior knowledge about the query length. Surprisingly, there has been limited work on this problem. A baseline solution is to calculate the correlations for every possible subsequence combination. Obviously, the brute force solution is not scalable for large datasets. In this work we study a space-constrained index that gives a tight correlation bound for subsequences of similar length and offset by intra-object grouping and inter-object grouping techniques. To the best of our knowledge, this is the first index to support normalized distance metric of arbitrary length subsequences. Extensive experimental evaluation on both real and synthetic sequence datasets verifies the efficiency and effectiveness of our proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Popescu:2013:PTP, author = "Adrian Daniel Popescu and Andrey Balmin and Vuk Ercegovac and Anastasia Ailamaki", title = "{PREDIcT}: towards predicting the runtime of large scale iterative analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1678--1689", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning algorithms are widely used today for analytical tasks such as data cleaning, data categorization, or data filtering. At the same time, the rise of social media motivates recent uptake in large scale graph processing. Both categories of algorithms are dominated by iterative subtasks, i.e., processing steps which are executed repetitively until a convergence condition is met. Optimizing cluster resource allocations among multiple workloads of iterative algorithms motivates the need for estimating their runtime, which in turn requires: (i) predicting the number of iterations, and (ii) predicting the processing time of each iteration. As both parameters depend on the characteristics of the dataset and on the convergence function, estimating their values before execution is difficult. This paper proposes PREDIcT, an experimental methodology for predicting the runtime of iterative algorithms. PREDIcT uses sample runs for capturing the algorithm's convergence trend and per-iteration key input features that are well correlated with the actual processing requirements of the complete input dataset. Using this combination of characteristics we predict the runtime of iterative algorithms, including algorithms with very different runtime patterns among subsequent iterations. 
Our experimental evaluation of multiple algorithms on scale-free graphs shows a relative prediction error of 10\%--30\% for predicting runtime, including algorithms with up to $ 100 \times $ runtime variability among consecutive iterations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2013:ERW, author = "Xiaohan Zhao and Adelbert Chang and Atish Das Sarma and Haitao Zheng and Ben Y. Zhao", title = "On the embeddability of random walk distances", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1690--1701", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analysis of large graphs is critical to the ongoing growth of search engines and social networks. One class of queries centers around node affinity, often quantified by random-walk distances between node pairs, including hitting time, commute time, and personalized PageRank (PPR). Despite the potential of these ``metrics,'' they are rarely, if ever, used in practice, largely due to extremely high computational costs. In this paper, we investigate methods to scalably and efficiently compute random-walk distances, by ``embedding'' graphs and distances into points and distances in geometric coordinate spaces. We show that while existing graph coordinate systems (GCS) can accurately estimate shortest path distances, they produce significant errors when embedding random-walk distances. Based on our observations, we propose a new graph embedding system that explicitly accounts for per-node graph properties that affect random walk. Extensive experiments on a range of graphs show that our new approach can accurately estimate both symmetric and asymmetric random-walk distances. 
Once a graph is embedded, our system can answer queries between any two nodes in 8 microseconds, orders of magnitude faster than existing methods. Finally, we show that our system produces estimates that can replace ground truth in applications with minimal impact on application output.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Muhlbauer:2013:ILM, author = "Tobias M{\"u}hlbauer and Wolf R{\"o}diger and Robert Seilbeck and Angelika Reiser and Alfons Kemper and Thomas Neumann", title = "Instant loading for main memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1702--1713", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "eScience and big data analytics applications are facing the challenge of efficiently evaluating complex queries over vast amounts of structured text data archived in network storage solutions. To analyze such data in traditional disk-based database systems, it needs to be bulk loaded, an operation whose performance largely depends on the wire speed of the data source and the speed of the data sink, i.e., the disk. As the speed of network adapters and disks has stagnated in the past, loading has become a major bottleneck. The delays it is causing are now ubiquitous as text formats are a preferred storage format for reasons of portability. But the game has changed: Ever increasing main memory capacities have fostered the development of in-memory database systems and very fast network infrastructures are on the verge of becoming economical. 
While hardware limitations for fast loading have disappeared, current approaches for main memory databases fail to saturate the now available wire speeds of tens of Gbit/s. With Instant Loading, we contribute a novel CSV loading approach that allows scalable bulk loading at wire speed. This is achieved by optimizing all phases of loading for modern super-scalar multi-core CPUs. Large main memory capacities and Instant Loading thereby facilitate a very efficient data staging processing model consisting of instantaneous load-work-unload cycles across data archives on a single node. Once data is loaded, updates and queries are efficiently processed with the flexibility, security, and high performance of relational main memory databases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alexiou:2013:ARF, author = "Karolina Alexiou and Donald Kossmann and Per-{\AA}ke Larson", title = "Adaptive range filters for cold data: avoiding trips to {Siberia}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1714--1725", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Bloom filters are a great technique to test whether a key is not in a set of keys. This paper presents a novel data structure called ARF. In a nutshell, ARFs are for range queries what Bloom filters are for point queries. That is, an ARF can determine whether a set of keys does not contain any keys that are part of a specific range. This paper describes the principles and methods for efficient implementation of ARFs and presents the results of comprehensive experiments that assess the precision, space, and latency of ARFs. 
Furthermore, this paper shows how ARFs can be applied to a commercial database system that partitions data into hot and cold regions to optimize queries that involve only hot data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandramouli:2013:SPA, author = "Badrish Chandramouli and Jonathan Goldstein and Abdul Quamar", title = "Scalable progressive analytics on big data in the {Cloud}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1726--1737", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analytics over the increasing quantity of data stored in the Cloud has become very expensive, particularly due to the pay-as-you-go Cloud computation model. Data scientists typically manually extract samples of increasing data size (progressive samples) using domain-specific sampling strategies for exploratory querying. This provides them with user-control, repeatable semantics, and result provenance. However, such solutions result in tedious workflows that preclude the reuse of work across samples. On the other hand, existing approximate query processing systems report early results, but do not offer the above benefits for complex ad-hoc queries. We propose a new progressive analytics system based on a progress model called Prism that (1) allows users to communicate progressive samples to the system; (2) allows efficient and deterministic query processing over samples; and (3) provides repeatable semantics and provenance to data scientists. We show that one can realize this model for atemporal relational queries using an unmodified temporal streaming engine, by re-interpreting temporal event fields to denote progress. 
Based on Prism, we build Now!, a progressive data-parallel computation framework for Windows Azure, where progress is understood as a first-class citizen in the framework. Now! works with ``progress-aware reducers'' --- in particular, it works with streaming engines to support progressive SQL over big data. Extensive experiments on Windows Azure with real and synthetic workloads validate the scalability and benefits of Now! and its optimizations, over current solutions for progressive analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ogden:2013:SXQ, author = "Peter Ogden and David Thomas and Peter Pietzuch", title = "Scalable {XML} query processing using parallel pushdown transducers", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1738--1749", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In online social networking, network monitoring and financial applications, there is a need to query high rate streams of XML data, but methods for executing individual XPath queries on streaming XML data have not kept pace with multicore CPUs. For data-parallel processing, a single XML stream is typically split into well-formed fragments, which are then processed independently. Such an approach, however, introduces a sequential bottleneck and suffers from low cache locality, limiting its scalability across CPU cores. We describe a data-parallel approach for the processing of streaming XPath queries based on pushdown transducers. Our approach permits XML data to be split into arbitrarily-sized chunks, with each chunk processed by a parallel automaton instance.
Since chunks may be malformed, our automata consider all possible starting states for XML elements and build mappings from starting to finishing states. These mappings can be constructed independently for each chunk by different CPU cores. For streaming queries from the XPathMark benchmark, we show a processing throughput of 2.5 GB/s, with near linear scaling up to 64 CPU cores.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huai:2013:UIB, author = "Yin Huai and Siyuan Ma and Rubao Lee and Owen O'Malley and Xiaodong Zhang", title = "Understanding insights into the basic structure and essential issues of table placement methods in clusters", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1750--1761", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A table placement method is a critical component in big data analytics on distributed systems. It determines the way how data values in a two-dimensional table are organized and stored in the underlying cluster. Based on Hadoop computing environments, several table placement methods have been proposed and implemented. However, a comprehensive and systematic study to understand, to compare, and to evaluate different table placement methods has not been done. Thus, it is highly desirable to gain important insights into the basic structure and essential issues of table placement methods in the context of big data processing infrastructures. In this paper, we present such a study. The basic structure of a data placement method consists of three core operations: row reordering, table partitioning, and data packing. 
All the existing placement methods are formed by these core operations with variations made by the three key factors: (1) the size of a horizontal logical subset of a table (or the size of a row group), (2) the function of mapping columns to column groups, and (3) the function of packing columns or column groups in a row group into physical blocks. We have designed and implemented a benchmarking tool to provide insights into how variations of each factor affect the I/O performance of reading data of a table stored by a table placement method. Based on our results, we give suggested actions to optimize table reading performance. Results from large-scale experiments have also confirmed that our findings are valid for production workloads. Finally, we present ORC File as a case study to show the effectiveness of our findings and suggested actions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mottin:2013:POF, author = "Davide Mottin and Alice Marascu and Senjuti Basu Roy and Gautam Das and Themis Palpanas and Yannis Velegrakis", title = "A probabilistic optimization framework for the empty-answer problem", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1762--1773", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a principled optimization-based interactive query relaxation framework for queries that return no answers. 
Given an initial query that returns an empty answer set, our framework dynamically computes and suggests alternative queries with less conditions than those the user has initially requested, in order to help the user arrive at a query with a non-empty answer, or at a query for which no matter how many additional conditions are ignored, the answer will still be empty. Our proposed approach for suggesting query relaxations is driven by a novel probabilistic framework based on optimizing a wide variety of application-dependent objective functions. We describe optimal and approximate solutions of different optimization problems using the framework. We analyze these solutions, experimentally verify their efficiency and effectiveness, and illustrate their advantage over the existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2013:SAG, author = "Yinghui Wu and Shengqi Yang and Mudhakar Srivatsa and Arun Iyengar and Xifeng Yan", title = "Summarizing answer graphs induced by keyword queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1774--1785", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Keyword search has been popularly used to query graph data. Due to the lack of structure support, a keyword query might generate an excessive number of matches, referred to as ``answer graphs'', that could include different relationships among keywords. An ignored yet important task is to group and summarize answer graphs that share similar structures and contents for better query interpretation and result understanding. This paper studies the summarization problem for the answer graphs induced by a keyword query $Q$. 
(1) A notion of summary graph is proposed to characterize the summarization of answer graphs. Given $Q$ and a set of answer graphs $G$, a summary graph preserves the relation of the keywords in $Q$ by summarizing the paths connecting the keywords nodes in $G$. (2) A quality metric of summary graphs, called coverage ratio, is developed to measure information loss of summarization. (3) Based on the metric, a set of summarization problems are formulated, which aim to find minimized summary graphs with certain coverage ratio. (a) We show that the complexity of these summarization problems ranges from PTIME to NP-complete. (b) We provide exact and heuristic summarization algorithms. (4) Using real-life and synthetic graphs, we experimentally verify the effectiveness and the efficiency of our techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Duan:2013:SKS, author = "Huizhong Duan and ChengXiang Zhai and Jinxing Cheng and Abhishek Gattani", title = "Supporting keyword search in product database: a probabilistic approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1786--1797", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to let users search for products conveniently in product database is critical to the success of e-commerce. Although structured query languages (e.g. SQL) can be used to effectively access the product database, it is very difficult for end users to learn and use. In this paper, we study how to optimize search over structured product entities (represented by specifications) with keyword queries such as ``cheap gaming laptop''.
One major difficulty in this problem is the vocabulary gap between the specifications of products in the database and the keywords people use in search queries. To solve the problem, we propose a novel probabilistic entity retrieval model based on query generation, where the entities would be ranked for a given keyword query based on the likelihood that a user who likes an entity would pose the query. Different ways to estimate the model parameters would lead to different variants of ranking functions. We start with simple estimates based on the specifications of entities, and then leverage user reviews and product search logs to improve the estimation. Multiple estimation algorithms are developed based on Maximum Likelihood and Maximum a Posteriori estimators. We evaluate the proposed product entity retrieval models on two newly created product search test collections. The results show that the proposed model significantly outperforms the existing retrieval models, benefiting from the modeling of attribute-level relevance. Despite the focus on product retrieval, the proposed modeling method is general and opens up many new opportunities in analyzing structured entity data with unstructured text data. We show the proposed probabilistic model can be easily adapted for many interesting applications including facet generation and review annotation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nirkhiwale:2013:SAA, author = "Supriya Nirkhiwale and Alin Dobra and Christopher Jermaine", title = "A sampling algebra for aggregate estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1798--1809", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As of 2005, sampling has been incorporated in all major database systems. While efficient sampling techniques are realizable, determining the accuracy of an estimate obtained from the sample is still an unresolved problem. In this paper, we present a theoretical framework that allows an elegant treatment of the problem. We base our work on generalized uniform sampling (GUS), a class of sampling methods that subsumes a wide variety of sampling techniques. We introduce a key notion of equivalence that allows GUS sampling operators to commute with selection and join, and derivation of confidence intervals. We illustrate the theory through extensive examples and give indications on how to use it to provide meaningful estimates in database systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dylla:2013:TPD, author = "Maximilian Dylla and Iris Miliaraki and Martin Theobald", title = "A temporal-probabilistic database model for information extraction", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1810--1821", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Temporal annotations of facts are a key component both for building a high-accuracy knowledge base and for answering queries over the resulting temporal knowledge base with high precision and recall. In this paper, we present a temporal-probabilistic database model for cleaning uncertain temporal facts obtained from information extraction methods. Specifically, we consider a combination of temporal deduction rules, temporal consistency constraints and probabilistic inference based on the common possible-worlds semantics with data lineage, and we study the theoretical properties of this data model. We further develop a query engine which is capable of scaling to very large temporal knowledge bases, with nearly interactive query response times over millions of uncertain facts and hundreds of thousands of grounded rules. Our experiments over two real-world datasets demonstrate the increased robustness of our approach compared to related techniques based on constraint solving via Integer Linear Programming (ILP) and probabilistic inference via Markov Logic Networks (MLNs). We are also able to show that our runtime performance is more than competitive to current ILP solvers and the fastest available, probabilistic but non-temporal, database engines.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fender:2013:CSG, author = "Pit Fender and Guido Moerkotte", title = "Counter strike: generic top-down join enumeration for hypergraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1822--1833", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding the optimal execution order of join operations is a crucial task of today's cost-based query optimizers. There are two approaches to identify the best plan: bottom-up and top-down join enumeration. But only the top-down approach allows for branch-and-bound pruning, which can improve compile time by several orders of magnitude while still preserving optimality. For both optimization strategies, efficient enumeration algorithms have been published. However, there are two severe limitations for the top-down approach: The published algorithms can handle only (1) simple (binary) join predicates and (2) inner joins. Since real queries may contain complex join predicates involving more than two relations, and outer joins as well as other non-inner joins, efficient top-down join enumeration cannot be used in practice yet. We develop a novel top-down join enumeration algorithm that overcomes these two limitations. Furthermore, we show that our new algorithm is competitive when compared to the state of the art in bottom-up processing even without playing out its advantage by making use of its branch-and-bound pruning capabilities.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Achakeev:2013:EBU, author = "Daniar Achakeev and Bernhard Seeger", title = "Efficient bulk updates on multiversion {B}-trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1834--1845", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Partial persistent index structures support efficient access to current and past versions of objects, while updates are allowed on the current version. The Multiversion B-Tree (MVBT) represents a partially persistent index-structure with both, asymptotic worst-case performance and excellent performance in real life applications. Updates are performed tuple-by-tuple with the same asymptotic performance as for standard B+trees. To the best of our knowledge, there is no efficient algorithm for bulk loading and bulk update of MVBT and other partially persistent index structures. In this paper, we propose the first loading algorithm for MVBT that meets the lower-bound of external sorting. In addition, our approach is also applicable to bulk updates. This is achieved by combining two basic technologies, weight balancing and buffer tree. Our extensive set of experiments confirm the theoretical findings: Our loading algorithm runs considerably faster than performing updates tuple-by-tuple.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Altwaijry:2013:QDA, author = "Hotham Altwaijry and Dmitri V. 
Kalashnikov and Sharad Mehrotra", title = "Query-driven approach to entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1846--1857", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper explores ``on-the-fly'' data cleaning in the context of a user query. A novel Query-Driven Approach (QDA) is developed that performs a minimal number of cleaning steps that are only necessary to answer a given selection query correctly. The comprehensive empirical evaluation of the proposed approach demonstrates its significant advantage in terms of efficiency over traditional techniques for query-driven applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Szlichta:2013:ECO, author = "Jaros{\l}aw Szlichta and Parke Godfrey and Jarek Gryz and Calisto Zuzarte", title = "Expressiveness and complexity of order dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1858--1869", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dependencies play an important role in databases. We study order dependencies (ODs) --- and unidirectional order dependencies (UODs), a proper sub-class of ODs --- which describe the relationships among lexicographical orderings of sets of tuples. We consider lexicographical ordering, as by the order-by operator in SQL, because this is the notion of order used in SQL and within query optimization. Our main goal is to investigate the inference problem for ODs, both in theory and in practice.
We show the usefulness of ODs in query optimization. We establish the following theoretical results: (i) a hierarchy of order dependency classes; (ii) a proof of co-NP-completeness of the inference problem for the subclass of UODs (and ODs); (iii) a proof of co-NP-completeness of the inference problem of functional dependencies (FDs) from ODs in general, but demonstrate linear time complexity for the inference of FDs from UODs; (iv) a sound and complete elimination procedure for inference over ODs; and (v) a sound and complete polynomial inference algorithm for sets of UODs over restricted domains.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pavan:2013:CST, author = "A. Pavan and Kanat Tangwongsan and Srikanta Tirthapura and Kun-Lung Wu", title = "Counting and sampling triangles from a graph stream", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1870--1881", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a new space-efficient algorithm for counting and sampling triangles --- and more generally, constant-sized cliques --- in a massive graph whose edges arrive as a stream. Compared to prior work, our algorithm yields significant improvements in the space and time complexity for these fundamental problems. Our algorithm is simple to implement and has very good practical performance on large graphs.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sowell:2013:EAI, author = "Benjamin Sowell and Marcos Vaz Salles and Tuan Cao and Alan Demers and Johannes Gehrke", title = "An experimental analysis of iterated spatial joins in main memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1882--1893", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many modern applications rely on high-performance processing of spatial data. Examples include location-based services, games, virtual worlds, and scientific simulations such as molecular dynamics and behavioral simulations. These applications deal with large numbers of moving objects that continuously sense their environment, and their data access can often be abstracted as a repeated spatial join. Updates to object positions are interspersed with these join operations, and batched for performance. Even for the most demanding scenarios, the data involved in these joins fits comfortably in the main memory of a cluster of machines, and most applications run completely in main memory for performance reasons. Choosing appropriate spatial join algorithms is challenging due to the large number of techniques in the literature. In this paper, we perform an extensive evaluation of repeated spatial join algorithms for distance (range) queries in main memory. Our study is unique in breadth when compared to previous work: We implement, tune, and compare ten distinct algorithms on several workloads drawn from the simulation and spatial indexing literature. We explore the design space of both index nested loops algorithms and specialized join algorithms, as well as the use of moving object indices that can be incrementally maintained. 
Surprisingly, we find that when queries and updates can be batched, repeatedly re-computing the join result from scratch outperforms using a moving object index in all but the most extreme cases. This suggests that --- given the code complexity of index structures for moving objects --- specialized join strategies over simple index structures, such as Synchronous Traversal over R-Trees, should be the methods of choice for the above applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2013:SQB, author = "Kisung Lee and Ling Liu", title = "Scaling queries over big {RDF} graphs with semantic hash partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1894--1905", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive volumes of big RDF data are growing beyond the performance capacity of conventional RDF data management systems operating on a single node. Applications using large RDF data demand efficient data partitioning solutions for supporting RDF data access on a cluster of compute nodes. In this paper we present a novel semantic hash partitioning approach and implement a Semantic HAsh Partitioning-Enabled distributed RDF data management system, called Shape. This paper makes three original contributions. First, the semantic hash partitioning approach we propose extends the simple hash partitioning method through direction-based triple groups and direction-based triple replications.
The latter enhances the former by controlled data replication through intelligent utilization of data access locality, such that queries over big RDF graphs can be processed with zero or very small amount of inter-machine communication cost. Second, we generate locality-optimized query execution plans that are more efficient than popular multi-node RDF data management systems by effectively minimizing the inter-machine communication cost for query processing. Third but not the least, we provide a suite of locality-aware optimization techniques to further reduce the partition size and cut down on the inter-machine communication cost during distributed query processing. Experimental results show that our system scales well and can process big RDF datasets more efficiently than existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Seo:2013:DSD, author = "Jiwon Seo and Jongsoo Park and Jaeho Shin and Monica S. Lam", title = "Distributed {SociaLite}: a {Datalog}-based language for large-scale graph analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1906--1917", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale graph analysis is becoming important with the rise of world-wide social network services. Recently in SociaLite, we proposed extensions to Datalog to efficiently and succinctly implement graph analysis programs on sequential machines. This paper describes novel extensions and optimizations of SociaLite for parallel and distributed executions to support large-scale graph analysis.
With distributed SociaLite, programmers simply annotate how data are to be distributed, then the necessary communication is automatically inferred to generate parallel code for cluster of multi-core machines. It optimizes the evaluation of recursive monotone aggregate functions using a delta stepping technique. In addition, approximate computation is supported in SociaLite, allowing programmers to trade off accuracy for less time and space. We evaluated SociaLite with six core graph algorithms used in many social network analyses. Our experiment with 64 Amazon EC2 8-core instances shows that SociaLite programs performed within a factor of two with respect to ideal weak scaling. Compared to optimized Giraph, an open-source alternative of Pregel, SociaLite programs are 4 to 12 times faster across benchmark algorithms, and 22 times more succinct on average. As a declarative query language, SociaLite, with the help of a compiler that generates efficient parallel and approximate code, can be used easily to create many social apps that operate on large-scale distributed graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sarwat:2013:HDS, author = "Mohamed Sarwat and Sameh Elnikety and Yuxiong He and Mohamed F. Mokbel", title = "{Horton+}: a distributed system for processing declarative reachability queries over partitioned graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1918--1929", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Horton+ is a graph query processing system that executes declarative reachability queries on a partitioned attributed multi-graph. 
It employs a query language, query optimizer, and a distributed execution engine. The query language expresses declarative reachability queries, and supports closures and predicates on node and edge attributes to match graph paths. We introduce three algebraic operators, select, traverse, and join, and a query is compiled into an execution plan containing these operators. As reachability queries access the graph elements in a random access pattern, the graph is therefore maintained in the main memory of a cluster of servers to reduce query execution time. We develop a distributed execution engine that processes a query plan in parallel on the graph servers. Since the query language is declarative, we build a query optimizer that uses graph statistics to estimate predicate selectivity. We experimentally evaluate the system performance on a cluster of 16 graph servers using synthetic graphs as well as a real graph from an application that uses reachability queries. The evaluation shows (1) the efficiency of the optimizer in reducing query execution time, (2) system scalability with the size of the graph and with the number of servers, and (3) the convenience of using declarative queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sundaram:2013:SSS, author = "Narayanan Sundaram and Aizana Turmukhametova and Nadathur Satish and Todd Mostak and Piotr Indyk and Samuel Madden and Pradeep Dubey", title = "Streaming similarity search over one billion tweets using parallel locality-sensitive hashing", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1930--1941", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding nearest neighbors has become an important operation on databases, with applications to text search, multimedia indexing, and many other areas. One popular algorithm for similarity search, especially for high dimensional data (where spatial indexes like kd-trees do not perform well) is Locality Sensitive Hashing (LSH), an approximation algorithm for finding similar objects. In this paper, we describe a new variant of LSH, called Parallel LSH (PLSH) designed to be extremely efficient, capable of scaling out on multiple nodes and multiple cores, and which supports high-throughput streaming of new data. Our approach employs several novel ideas, including: cache-conscious hash table layout, using a 2-level merge algorithm for hash table construction; an efficient algorithm for duplicate elimination during hash-table querying; an insert-optimized hash table structure and efficient data expiration algorithm for streaming data; and a performance model that accurately estimates performance of the algorithm and can be used to optimize parameter settings. We show that on a workload where we perform similarity search on a dataset of $ > 1 $ Billion tweets, with hundreds of millions of new tweets per day, we can achieve query times of 1--2.5 ms.
We show that this is an order of magnitude faster than existing indexing schemes, such as inverted indexes. To the best of our knowledge, this is the fastest implementation of LSH, with table construction times up to $ 3.7 \times $ faster and query times that are $ 8.3 \times $ faster than a basic implementation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{DeBrabant:2013:ACN, author = "Justin DeBrabant and Andrew Pavlo and Stephen Tu and Michael Stonebraker and Stan Zdonik", title = "{Anti-caching}: a new approach to database management system architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1942--1953", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The traditional wisdom for building disk-based relational database management systems (DBMS) is to organize data in heavily-encoded blocks stored on disk, with a main memory block cache. In order to improve performance given high disk latency, these systems use a multi-threaded architecture with dynamic record-level locking that allows multiple transactions to access the database at the same time. Previous research has shown that this results in substantial overhead for on-line transaction processing (OLTP) applications [15]. The next generation DBMSs seek to overcome these limitations with architecture based on main memory resident data. To overcome the restriction that all data fit in main memory, we propose a new technique, called anti-caching, where cold data is moved to disk in a transactionally-safe manner as the database grows in size. 
Because data initially resides in memory, an anti-caching architecture reverses the traditional storage hierarchy of disk-based systems. Main memory is now the primary storage device. We implemented a prototype of our anti-caching proposal in a high-performance, main memory OLTP DBMS and performed a series of experiments across a range of database sizes, workload skews, and read/write mixes. We compared its performance with an open-source, disk-based DBMS optionally fronted by a distributed main memory cache. Our results show that for higher skewed workloads the anti-caching architecture has a performance advantage over either of the other architectures tested of up to $ 9 \times $ for a data size $ 8 \times $ larger than memory.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qardaji:2013:UHM, author = "Wahbeh Qardaji and Weining Yang and Ninghui Li", title = "Understanding hierarchical methods for differentially private histograms", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1954--1965", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, many approaches to differentially privately publish histograms have been proposed. Several approaches rely on constructing tree structures in order to decrease the error when answering large range queries. In this paper, we examine the factors affecting the accuracy of hierarchical approaches by studying the mean squared error (MSE) when answering range queries.
We start with one-dimensional histograms, and analyze how the MSE changes with different branching factors, after employing constrained inference, and with different methods to allocate the privacy budget among hierarchy levels. Our analysis and experimental results show that combining the choice of a good branching factor with constrained inference outperform the current state of the art. Finally, we extend our analysis to multi-dimensional histograms. We show that the benefits from employing hierarchical methods beyond a single dimension are significantly diminished, and when there are 3 or more dimensions, it is almost always better to use the Flat method instead of a hierarchy.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2013:TSD, author = "Rui Li and Shengjie Wang and Kevin Chen-Chuan Chang", title = "Towards social data platform: automatic topic-focused monitor for {Twitter} stream", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1966--1977", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many novel applications have been built based on analyzing tweets about specific topics. While these applications provide different kinds of analysis, they share a common task of monitoring ``target'' tweets from the Twitter stream for a topic. The current solution for this task tracks a set of manually selected keywords with Twitter APIs. Obviously, this manual approach has many limitations. In this paper, we propose a data platform to automatically monitor target tweets from the Twitter stream for any given topic. 
To monitor target tweets in an optimal and continuous way, we design Automatic Topic-focused Monitor (ATM), which iteratively (1) samples tweets from the stream and (2) selects keywords to track based on the samples. To realize ATM, we develop a tweet sampling algorithm to sample sufficient unbiased tweets with available Twitter APIs, and a keyword selection algorithm to efficiently select keywords that have a near-optimal coverage of target tweets under cost constraints. We conduct extensive experiments to show the effectiveness of ATM. E.g., ATM covers 90\% of target tweets for a topic and improves the manual approach by 49\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jin:2013:SFS, author = "Ruoming Jin and Guan Wang", title = "Simple, fast, and scalable reachability oracle", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1978--1989", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A reachability oracle (or hop labeling) assigns each vertex $v$ two sets of vertices: $ {\rm Lout}(v) $ and $ {\rm Lin}(v) $, such that $u$ reaches $v$ iff $ {\rm Lout}(u) \cap {\rm Lin}(v) \neq \emptyset $. Despite their simplicity and elegance, reachability oracles have failed to achieve efficiency in more than ten years since their introduction: The main problem is high construction cost, which stems from a set-cover framework and the need to materialize transitive closure.
In this paper, we present two simple and efficient labeling algorithms, Hierarchical-Labeling and Distribution-Labeling, which can work on massive real-world graphs: Their construction time is an order of magnitude faster than the set-cover based labeling approach, and transitive closure materialization is not needed. On large graphs, their index sizes and their query performance can now beat the state-of-the-art transitive closure compression and online search approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bakibayev:2013:AOF, author = "Nurzhan Bakibayev and Tom{\'a}s Kocisk{\'y} and Dan Olteanu and Jakub Z{\'a}vodn{\'y}", title = "Aggregation and ordering in factorised databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "1990--2001", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A common approach to data analysis involves understanding and manipulating succinct representations of data. In earlier work, we put forward a succinct representation system for relational data called factorised databases and reported on the main-memory query engine FDB for select-project-join queries on such databases. In this paper, we extend FDB to support a larger class of practical queries with aggregates and ordering. This requires novel optimisation and evaluation techniques. We show how factorisation coupled with partial aggregation can effectively reduce the number of operations needed for query evaluation. We also show how factorisations of query results can support enumeration of tuples in desired orders as efficiently as listing them from the unfactorised, sorted results.
We experimentally observe that FDB can outperform off-the-shelf relational engines by orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2013:PCS, author = "Yoonjae Park and Jun-Ki Min and Kyuseok Shim", title = "Parallel computation of skyline and reverse skyline queries using {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "2002--2013", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The skyline operator and its variants such as dynamic skyline and reverse skyline operators have attracted considerable attention recently due to their broad applications. However, computations of such operators are challenging today since there is an increasing trend of applications to deal with big data. For such data-intensive applications, the MapReduce framework has been widely used recently. In this paper, we propose efficient parallel algorithms for processing the skyline and its variants using MapReduce. We first build histograms to effectively prune out nonskyline (non-reverse skyline) points in advance. We next partition data based on the regions divided by the histograms and compute candidate (reverse) skyline points for each region independently using MapReduce. Finally, we check whether each candidate point is actually a (reverse) skyline point in every region independently. Our performance study confirms the effectiveness and scalability of the proposed algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2013:FIG, author = "Wenlei Xie and Guozhang Wang and David Bindel and Alan Demers and Johannes Gehrke", title = "Fast iterative graph computation with block updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "6", number = "14", pages = "2014--2025", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Dec 13 05:57:13 MST 2013", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Scaling iterative graph processing applications to large graphs is an important problem. Performance is critical, as data scientists need to execute graph programs many times with varying parameters. The need for a high-level, high-performance programming model has inspired much research on graph programming frameworks. In this paper, we show that the important class of computationally light graph applications --- applications that perform little computation per vertex --- has severe scalability problems across multiple cores as these applications hit an early ``memory wall'' that limits their speedup. We propose a novel block-oriented computation model, in which computation is iterated locally over blocks of highly connected nodes, significantly improving the amount of computation per cache miss. Following this model, we describe the design and implementation of a block-aware graph processing runtime that keeps the familiar vertex-centric programming paradigm while reaping the benefits of block-oriented execution. Our experiments show that block-oriented execution significantly improves the performance of our framework for several graph applications.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2013:EEK, author = "Xiaoli Wang and Xiaofeng Ding and Anthony K. H. Tung and Zhenjie Zhang", title = "Efficient and effective {KNN} sequence search with approximate $n$-grams", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "1--12", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we address the problem of finding $k$-nearest neighbors (KNN) in sequence databases using the edit distance. Unlike most existing works using short and exact $n$-gram matchings together with a filter-and-refine framework for KNN sequence search, our new approach allows us to use longer but approximate $n$-gram matchings as a basis of KNN candidates pruning. Based on this new idea, we devise a pipeline framework over a two-level index for searching KNN in the sequence database. By coupling this framework together with several efficient filtering strategies, i.e. the frequency queue and the well-known Combined Algorithm (CA), our proposal brings various enticing advantages over existing works, including (1) huge reduction on false positive candidates to avoid large overheads on candidate verifications; (2) progressive result update and early termination; and (3) good extensibility to parallel computation. We conduct extensive experiments on three real datasets to verify the superiority of the proposed framework.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2013:MSE, author = "Weiren Yu and Xuemin Lin and Wenjie Zhang and Lijun Chang and Jian Pei", title = "More is simpler: effectively and efficiently assessing node-pair similarities based on hyperlinks", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "13--24", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity assessment is one of the core tasks in hyperlink analysis. Recently, with the proliferation of applications, e.g., web search and collaborative filtering, SimRank has been a well-studied measure of similarity between two nodes in a graph. It recursively follows the philosophy that ``two nodes are similar if they are referenced (have incoming edges) from similar nodes'', which can be viewed as an aggregation of similarities based on incoming paths. Despite its popularity, SimRank has an undesirable property, i.e., ``zero-similarity'': It only accommodates paths with equal length from a common ``center'' node. Thus, a large portion of other paths are fully ignored. This paper attempts to remedy this issue. (1) We propose and rigorously justify SimRank*, a revised version of SimRank, which resolves such counter-intuitive ``zero-similarity'' issues while inheriting merits of the basic SimRank philosophy. (2) We show that the series form of SimRank* can be reduced to a fairly succinct and elegant closed form, which looks even simpler than SimRank, yet enriches semantics without suffering from increased computational cost. This leads to a fixed-point iterative paradigm of SimRank* in $ O (K n m) $ time on a graph of $n$ nodes and $m$ edges for $K$ iterations, which is comparable to SimRank. 
(3) To further optimize SimRank* computation, we leverage a novel clustering strategy via edge concentration. Due to its NP-hardness, we devise an efficient and effective heuristic to speed up SimRank* computation to $ O (K n \tilde {m}) $ time, where $ \tilde {m} $ is generally much smaller than $m$. (4) Using real and synthetic data, we empirically verify the rich semantics of SimRank*, and demonstrate its high computation efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gyssens:2013:ATS, author = "Marc Gyssens and Jan Paredaens and Dirk {Van Gucht} and Jef Wijsen and Yuqing Wu", title = "An approach towards the study of symmetric queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "25--36", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many data-intensive applications have to query a database that involves sequences of sets of objects. It is not uncommon that the order of the sets in such a sequence does not affect the result of the query. Such queries are called symmetric. In this paper, the authors wish to initiate research on symmetric queries. Thereto, a data model is proposed in which a binary relation between objects and set names encodes set membership. On this data model, two query languages are introduced, QuineCALC and SyCALC. They are correlated in a manner that is made precise with the symmetric Boolean functions of Quine, respectively symmetric relational functions, on sequences of sets of given length. The latter do not only involve the Boolean operations union, intersection, and complement, but also projection and Cartesian product.
Quine's characterization of symmetric Boolean functions in terms of incidence information is generalized to QuineCALC queries. In the process, an incidence-based normal form for QuineCALC queries is proposed. Inspired by these desirable incidence-related properties of QuineCALC queries, counting-only queries are introduced as SyCALC queries for which the result only depends on incidence information. Counting-only queries are then characterized as quantified Boolean combinations of QuineCALC queries, and a normal form is proposed for them as well. Finally, it is shown that, while it is undecidable whether a SyCALC query is counting-only, it is decidable whether a counting-only query is a QuineCALC query.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Das:2013:CST, author = "Sudipto Das and Vivek R. Narasayya and Feng Li and Manoj Syamala", title = "{CPU} sharing techniques for performance isolation in multi-tenant relational database-as-a-service", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "37--48", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-tenancy and resource sharing are essential to make a Database-as-a-Service (DaaS) cost-effective. However, one major consequence of resource sharing is that the performance of one tenant's workload can be significantly affected by the resource demands of co-located tenants. The lack of performance isolation in a shared environment can make DaaS less attractive to performance-sensitive tenants. Our approach to performance isolation in a DaaS is to isolate the key resources needed by the tenants' workload. 
In this paper, we focus on the problem of effectively sharing and isolating CPU among co-located tenants in a multi-tenant DaaS. We show that traditional CPU sharing abstractions and algorithms are inadequate to support several key new requirements that arise in DaaS: (a) absolute and fine-grained CPU reservations without static allocation; (b) support elasticity by dynamically adapting to bursty resource demands; and (c) enable the DaaS provider to suitably tradeoff revenue with fairness. We implemented these new scheduling algorithms in a commercial DaaS prototype and extensive experiments demonstrate the effectiveness of our techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2013:ATK, author = "Qian Chen and Haibo Hu and Jianliang Xu", title = "Authenticating top-$k$ queries in location-based services with confidentiality", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "49--60", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "State-of-the-art location-based services (LBSs) involve data owners, requesting clients, and service providers. As LBSs become new business opportunities, there is an increasing necessity to verify the genuineness of service results. Unfortunately, while traditional query authentication techniques can address this issue, they fail to protect the confidentiality of data, which is sensitive location information when LBSs are concerned. Recent work has studied how to preserve such location privacy in query authentication. 
However, the prior work is limited to range queries, where private values only appear on one side of the range comparison. In this paper, we address the more challenging authentication problem on top-$k$ queries, where private values appear on both sides of a comparison. To start with, we propose two novel cryptographic building blocks, followed by a comprehensive design of authentication schemes for top-$k$ queries based on R-tree and Power Diagram indexes. Optimizations, security analysis, and experimental results consistently show the effectiveness and robustness of the proposed schemes under various system settings and query workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qi:2013:TDO, author = "Zichao Qi and Yanghua Xiao and Bin Shao and Haixun Wang", title = "Toward a distance oracle for billion-node graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "61--72", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The emergence of real life graphs with billions of nodes poses significant challenges for managing and querying these graphs. One of the fundamental queries submitted to graphs is the shortest distance query. Online BFS (breadth-first search) and offline pre-computing pairwise shortest distances are prohibitive in time or space complexity for billion-node graphs. In this paper, we study the feasibility of building distance oracles for billion-node graphs. A distance oracle provides approximate answers to shortest distance queries by using a pre-computed data structure for the graph. 
Sketch-based distance oracles are good candidates because they assign each vertex a sketch of bounded size, which means they have linear space complexity. However, state-of-the-art sketch-based distance oracles lack efficiency or accuracy when dealing with big graphs. In this paper, we address the scalability and accuracy issues by focusing on optimizing the three key factors that affect the performance of distance oracles: landmark selection, distributed BFS, and answer generation. We conduct extensive experiments on both real networks and synthetic networks to show that we can build distance oracles of affordable cost and efficiently answer shortest distance queries even for billion-node graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaul:2013:FSP, author = "Manohar Kaul and Raymond Chi-Wing Wong and Bin Yang and Christian S. Jensen", title = "Finding shortest paths on terrains by killing two birds with one stone", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "73--84", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increasing availability of terrain data, e.g., from aerial laser scans, the management of such data is attracting increasing attention in both industry and academia. In particular, spatial queries, e.g., $k$-nearest neighbor and reverse nearest neighbor queries, in Euclidean and spatial network spaces are being extended to terrains. Such queries all rely on an important operation, that of finding shortest surface distances. However, shortest surface distance computation is very time consuming. 
We propose techniques that enable efficient computation of lower and upper bounds of the shortest surface distance, which enable faster query processing by eliminating expensive distance computations. Empirical studies show that our bounds are much tighter than the best-known bounds in many cases and that they enable speedups of up to 43 times for some well-known spatial queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Balkesen:2013:MCM, author = "Cagri Balkesen and Gustavo Alonso and Jens Teubner and M. Tamer {\"O}zsu", title = "Multi-core, main-memory joins: sort vs. hash revisited", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "1", pages = "85--96", month = sep, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:56 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we experimentally study the performance of main-memory, parallel, multi-core join algorithms, focusing on sort-merge and (radix-)hash join. The relative performance of these two join approaches have been a topic of discussion for a long time. With the advent of modern multi-core architectures, it has been argued that sort-merge join is now a better choice than radix-hash join. This claim is justified based on the width of SIMD instructions (sort-merge outperforms radix-hash join once SIMD is sufficiently wide), and NUMA awareness (sort-merge is superior to hash join in NUMA architectures). We conduct extensive experiments on the original and optimized versions of these algorithms. The experiments show that, contrary to these claims, radix-hash join is still clearly superior, and sort-merge approaches to performance of radix only when very large amounts of data are involved. 
The paper also provides the fastest implementations of these algorithms, and covers many aspects of modern hardware architectures relevant not only for joins but for any parallel data processing operator.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schuhknecht:2013:UPD, author = "Felix Martin Schuhknecht and Alekh Jindal and Jens Dittrich", title = "The uncracked pieces in database cracking", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "2", pages = "97--108", month = oct, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:58 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database cracking has been an area of active research in recent years. The core idea of database cracking is to create indexes adaptively and incrementally as a side-product of query processing. Several works have proposed different cracking techniques for different aspects including updates, tuple-reconstruction, convergence, concurrency-control, and robustness. However, there is a lack of any comparative study of these different methods by an independent group. In this paper, we conduct an experimental study on database cracking. Our goal is to critically review several aspects, identify the potential, and propose promising directions in database cracking. With this study, we hope to expand the scope of database cracking and possibly leverage cracking in database engines other than MonetDB. We repeat several prior database cracking works including the core cracking algorithms as well as three other works on convergence (hybrid cracking), tuple-reconstruction (sideways cracking), and robustness (stochastic cracking) respectively. We evaluate these works and show possible directions to do even better. 
We further test cracking under a variety of experimental settings, including high selectivity queries, low selectivity queries, and multiple query access patterns. Finally, we compare cracking against different sorting algorithms as well as against different main-memory optimised indexes, including the recently proposed Adaptive Radix Tree (ART). Our results show that: (i) the previously proposed cracking algorithms are repeatable, (ii) there is still enough room to significantly improve the previously proposed cracking algorithms, (iii) cracking depends heavily on query selectivity, (iv) cracking needs to catch up with modern indexing trends, and (v) different indexing algorithms have different indexing signatures.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eravci:2013:DBR, author = "Bahaeddin Eravci and Hakan Ferhatosmanoglu", title = "Diversity based relevance feedback for time series search", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "2", pages = "109--120", month = oct, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:58 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a diversity based relevance feedback approach for time series data to improve the accuracy of search results. We first develop the concept of relevance feedback for time series based on dual-tree complex wavelet (CWT) and SAX based approaches. We aim to enhance the search quality by incorporating diversity in the results presented to the user for feedback. We then propose a method which utilizes the representation type as part of the feedback, as opposed to a human choosing based on a preprocessing or training phase. 
The proposed methods utilize a weighting to handle the relevance feedback of important properties for both single and multiple representation cases. Our experiments on a large variety of time series data sets show that the proposed diversity based relevance feedback improves the retrieval performance. Results confirm that representation feedback incorporates item diversity implicitly and achieves good performance even when using simple nearest neighbor as the retrieval method. To the best of our knowledge, this is the first study on diversification of time series search to improve retrieval accuracy and representation feedback.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pelley:2013:SMN, author = "Steven Pelley and Thomas F. Wenisch and Brian T. Gold and Bill Bridge", title = "Storage management in the {NVRAM} era", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "2", pages = "121--132", month = oct, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:21:58 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Emerging nonvolatile memory technologies (NVRAM) offer an alternative to disk that is persistent, provides read latency similar to DRAM, and is byte-addressable. Such NVRAMs could revolutionize online transaction processing (OLTP), which today must employ sophisticated optimizations with substantial software overheads to overcome the long latency and poor random access performance of disk. Nevertheless, many candidate NVRAM technologies exhibit their own limitations, such as greater-than-DRAM latency, particularly for writes. In this paper, we reconsider OLTP durability management to optimize recovery performance and forward-processing throughput for emerging NVRAMs. 
First, we demonstrate that using NVRAM as a drop-in replacement for disk allows near-instantaneous recovery, but software complexity necessary for disk (i.e., Write Ahead Logging/ARIES) limits transaction throughput. Next, we consider the possibility of removing software-managed DRAM buffering. Finally, we measure the cost of ordering writes to NVRAM, which is vital for correct recovery. We consider three recovery mechanisms: NVRAM Disk-Replacement, In-Place Updates (transactions persist data in-place), and NVRAM Group Commit (transactions commit/persist atomically in batches). Whereas In-Place Updates offers the simplest design, it introduces persist synchronizations at every page update. NVRAM Group Commit minimizes persist synchronization, offering up to a 50\% throughput improvement for large synchronous persist latencies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Salloum:2013:OOO, author = "Mariam Salloum and Xin Luna Dong and Divesh Srivastava and Vassilis J. Tsotras", title = "Online ordering of overlapping data sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "133--144", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data integration systems offer a uniform interface for querying a large number of autonomous and heterogeneous data sources. Ideally, answers are returned as sources are queried and the answer list is updated as more answers arrive. Choosing a good ordering in which the sources are queried is critical for increasing the rate at which answers are returned. However, this problem is challenging since we often do not have complete or precise statistics of the sources, such as their coverage and overlap. 
It is further exacerbated in the Big Data era, which is witnessing two trends in Deep-Web data: first, obtaining a full coverage of data in a particular domain often requires extracting data from thousands of sources; second, there is often a big variation in overlap between different data sources. In this paper we present OASIS, an {Online} query {Answering} {System} for {overlappIng} {Sources}. OASIS has three key components for source ordering. First, the Overlap Estimation component estimates overlaps between sources according to available statistics under the Maximum Entropy principle. Second, the Source Ordering component orders the sources according to the new contribution they are expected to provide, and adjusts the ordering based on statistics collected during query answering. Third, the Statistics Enrichment component selects critical missing statistics to enrich at runtime. Experimental results on both real and synthetic data show high efficiency and scalability of our algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2013:MQO, author = "Guoping Wang and Chee-Yong Chan", title = "Multi-query optimization in {MapReduce} framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "145--156", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce has recently emerged as a new paradigm for large-scale data analysis due to its high scalability, fine-grained fault tolerance and easy programming model. Since different jobs often share similar work (e.g., several jobs scan the same input file or produce the same map output), there are many opportunities to optimize the performance for a batch of jobs. 
In this paper, we propose two new techniques for multi-job optimization in the MapReduce framework. The first is a generalized grouping technique (which generalizes the recently proposed MRShare technique) that merges multiple jobs into a single job thereby enabling the merged jobs to share both the scan of the input file as well as the communication of the common map output. The second is a materialization technique that enables multiple jobs to share both the scan of the input file as well as the communication of the common map output via partial materialization of the map output of some jobs (in the map and/or reduce phase). Our second contribution is the proposal of a new optimization algorithm that given an input batch of jobs, produces an optimal plan by a judicious partitioning of the jobs into groups and an optimal assignment of the processing technique to each group. Our experimental results on Hadoop demonstrate that our new approach significantly outperforms the state-of-the-art technique, MRShare, by up to 107\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2013:AAD, author = "Zhenhui Li and Bolin Ding and Fei Wu and Tobias Kin Hou Lei and Roland Kays and Margaret C. Crofoot", title = "Attraction and avoidance detection from movements", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "157--168", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the development of positioning technology, movement data has become widely available nowadays. An important task in movement data analysis is to mine the relationships among moving objects based on their spatiotemporal interactions. 
Among all relationship types, attraction and avoidance are arguably the most natural ones. However, rather surprisingly, there is no existing method that addresses the problem of mining significant attraction and avoidance relationships in a well-defined and unified framework. In this paper, we propose a novel method to measure the significance value of relationship between any two objects by examining the background model of their movements via permutation test. Since permutation test is computationally expensive, two effective pruning strategies are developed to reduce the computation time. Furthermore, we show how the proposed method can be extended to efficiently answer the classic threshold query: given an object, retrieve all the objects in the database that have relationships, whose significance values are above certain threshold, with the query object. Empirical studies on both synthetic data and real movement data demonstrate the effectiveness and efficiency of our method.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2013:PBA, author = "Xiang Zhao and Chuan Xiao and Xuemin Lin and Qing Liu and Wenjie Zhang", title = "A partition-based approach to structure similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "169--180", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs are widely used to model complex data in many applications, such as bioinformatics, chemistry, social networks, pattern recognition, etc. A fundamental and critical query primitive is to efficiently search similar structures in a large collection of graphs. 
This paper studies the graph similarity queries with edit distance constraints. Existing solutions to the problem utilize fixed-size overlapping substructures to generate candidates, and thus become susceptible to large vertex degrees or large distance thresholds. In this paper, we present a partition-based approach to tackle the problem. By dividing data graphs into variable-size non-overlapping partitions, the edit distance constraint is converted to a graph containment constraint for candidate generation. We develop efficient query processing algorithms based on the new paradigm. A candidate pruning technique and an improved graph edit distance algorithm are also developed to further boost the performance. In addition, a cost-aware graph partitioning technique is devised to optimize the index. Extensive experiments demonstrate our approach significantly outperforms existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bailis:2013:HAT, author = "Peter Bailis and Aaron Davidson and Alan Fekete and Ali Ghodsi and Joseph M. Hellerstein and Ion Stoica", title = "Highly available transactions: virtues and limitations", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "181--192", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To minimize network latency and remain online during server failures and network partitions, many modern distributed data storage systems eschew transactional functionality, which provides strong semantic guarantees for groups of multiple operations over multiple data items. 
In this work, we consider the problem of providing Highly Available Transactions (HATs): transactional guarantees that do not suffer unavailability during system partitions or incur high network latency. We introduce a taxonomy of highly available systems and analyze existing ACID isolation and distributed data consistency guarantees to identify which can and cannot be achieved in HAT systems. This unifies the literature on weak transactional isolation, replica consistency, and highly available systems. We analytically and experimentally quantify the availability and performance benefits of HATs --- often two to three orders of magnitude over wide-area networks --- and discuss their necessary semantic compromises.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tian:2013:TLV, author = "Yuanyuan Tian and Andrey Balmin and Severin Andreas Corsten and Shirish Tatikonda and John McPherson", title = "From {``think like a vertex''} to {``think like a graph''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "193--204", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To meet the challenge of processing rapidly growing graph and network data created by modern applications, a number of distributed graph processing systems have emerged, such as Pregel and GraphLab. All these systems divide input graphs into partitions, and employ a ``think like a vertex'' programming model to support iterative graph computation. This vertex-centric model is easy to program and has been proved useful for many graph algorithms. However, this model hides the partitioning information from the users, thus prevents many algorithm-specific optimizations. 
This often results in longer execution time due to excessive network messages (e.g. in Pregel) or heavy scheduling overhead to ensure data consistency (e.g. in GraphLab). To address this limitation, we propose a new ``think like a graph'' programming paradigm. Under this graph-centric model, the partition structure is opened up to the users, and can be utilized so that communication within a partition can bypass the heavy message passing or scheduling machinery. We implemented this model in a new system, called Giraph++, based on Apache Giraph, an open source implementation of Pregel. We explore the applicability of the graph-centric model to three categories of graph algorithms, and demonstrate its flexibility and superior performance, especially on well-partitioned data. For example, on a web graph with 118 million vertices and 855 million edges, the graph-centric version of connected component detection algorithm runs 63X faster and uses 204X fewer network messages than its vertex-centric counterpart.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Niedermayer:2013:PNN, author = "Johannes Niedermayer and Andreas Z{\"u}fle and Tobias Emrich and Matthias Renz and Nikos Mamoulis and Lei Chen and Hans-Peter Kriegel", title = "Probabilistic nearest neighbor queries on uncertain moving object trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "3", pages = "205--216", month = nov, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:00 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearest neighbor (NN) queries in trajectory databases have received significant attention in the past, due to their applications in spatio-temporal data analysis. 
More recent work has considered the realistic case where the trajectories are uncertain; however, only simple uncertainty models have been proposed, which do not allow for accurate probabilistic search. In this paper, we fill this gap by addressing probabilistic nearest neighbor queries in databases with uncertain trajectories modeled by stochastic processes, specifically the Markov chain model. We study three nearest neighbor query semantics that take as input a query state or trajectory $q$ and a time interval, and theoretically evaluate their runtime complexity. Furthermore we propose a sampling approach which uses Bayesian inference to guarantee that sampled trajectories conform to the observation data stored in the database. This sampling approach can be used in Monte-Carlo based approximation solutions. We include an extensive experimental study to support our theoretical results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karanasos:2013:DSD, author = "Konstantinos Karanasos and Asterios Katsifodimos and Ioana Manolescu", title = "{Delta}: scalable data dissemination under capacity constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "217--228", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In content-based publish-subscribe (pub/sub) systems, users express their interests as queries over a stream of publications. 
Scaling up content-based pub/sub to very large numbers of subscriptions is challenging: users are interested in low latency, that is, getting subscription results fast, while the pub/sub system provider is mostly interested in scaling, i.e., being able to serve large numbers of subscribers, with low computational resources utilization. We present a novel approach for scalable content-based pub/sub in the presence of constraints on the available CPU and network resources, implemented within our pub/sub system Delta. We achieve scalability by off-loading some subscriptions from the pub/sub server, and leveraging view-based query rewriting to feed these subscriptions from the data accumulated in others. Our main contribution is a novel algorithm for organizing views in a multi-level dissemination network, exploiting view-based rewriting and powerful linear programming capabilities to scale to many views, respect capacity constraints, and minimize latency. The efficiency and effectiveness of our algorithm are confirmed through extensive experiments and a large deployment in a WAN.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Budak:2013:GOD, author = "Ceren Budak and Theodore Georgiou and Divyakant Agrawal and Amr {El Abbadi}", title = "{GeoScope}: online detection of geo-correlated information trends in social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "229--240", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The First Law of Geography states ``Everything is related to everything else, but near things are more related than distant things''. 
This spatial significance has implications in various applications, trend detection being one of them. In this paper we propose a new algorithmic tool, GeoScope, to detect geo-trends. GeoScope is a data streams solution that detects correlations between topics and locations in a sliding window, in addition to analyzing topics and locations independently. GeoScope offers theoretical guarantees for detecting all trending correlated pairs while requiring only sub-linear space and running time. We perform various human validation tasks to demonstrate the value of GeoScope. The results show that human judges prefer GeoScope to the best performing baseline solution 4:1 in terms of the geographical significance of the presented information. As the Twitter analysis demonstrates, GeoScope successfully filters out topics without geo-intent and detects various local interests such as emergency events, political demonstrations or cultural events. Experiments on Twitter show that GeoScope has perfect recall and near-perfect precision.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Onizuka:2013:OIQ, author = "Makoto Onizuka and Hiroyuki Kato and Soichiro Hidaka and Keisuke Nakano and Zhenjiang Hu", title = "Optimization for iterative queries on {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "241--252", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose OptIQ, a query optimization approach for iterative queries in distributed environment. 
OptIQ removes redundant computations among different iterations by extending the traditional techniques of view materialization and incremental view evaluation. First, OptIQ decomposes iterative queries into invariant and variant views, and materializes the former view. Redundant computations are removed by reusing the materialized view among iterations. Second, OptIQ incrementally evaluates the variant view, so that redundant computations are removed by skipping the evaluation on converged tuples in the variant view. We verify the effectiveness of OptIQ through the queries of PageRank and $k$-means clustering on real datasets. The results show that OptIQ achieves high efficiency, up to five times faster than is possible without removing the redundant computations among iterations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shuai:2013:WOS, author = "Hong-Han Shuai and De-Nian Yang and Philip S. Yu and Ming-Syan Chen", title = "Willingness optimization for social group activity", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "253--264", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Studies show that a person is willing to join a social group activity if the activity is interesting, and if some close friends also join the activity as companions. The literature has demonstrated that the interests of a person and the social tightness among friends can be effectively derived and mined from social networking websites. 
However, even with the above two kinds of information widely available, social group activities still need to be coordinated manually, and the process is tedious and time-consuming for users, especially for a large social group activity, due to complications of social connectivity and the diversity of possible interests among friends. To address the above important need, this paper proposes to automatically select and recommend potential attendees of a social group activity, which could be very useful for social networking websites as a value-added service. We first formulate a new problem, named Willingness mAximization for Social grOup (WASO). This paper points out that the solution obtained by a greedy algorithm is likely to be trapped in a local optimal solution. Thus, we design a new randomized algorithm to effectively and efficiently solve the problem. Given the available computational budgets, the proposed algorithm is able to optimally allocate the resources and find a solution with an approximation ratio. We implement the proposed algorithm in Facebook, and the user study demonstrates that social groups obtained by the proposed algorithm significantly outperform the solutions manually configured by users.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2013:HPS, author = "Lei Cao and Elke A. 
Rundensteiner", title = "High performance stream query processing with correlation-aware partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "265--276", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "State-of-the-art optimizers produce one single optimal query plan for all stream data, in spite of such a singleton plan typically being sub-optimal or even poor for highly correlated data. Recently a new stream processing paradigm, called multi-route approach, has emerged as a promising approach for tackling this problem. Multi-route first divides data streams into several partitions and then creates a separate query plan for each combination of partitions. Unfortunately current approaches suffer from severe shortcomings, in particular, the lack of an effective partitioning strategy and the prohibitive query optimization expense. In this work we propose the first practical multi-route optimizer named correlation-aware multi-route stream query optimizer (or CMR) that solves both problems. By exploiting both intra- and inter-stream correlations of streams, CMR produces effective partitions without having to undertake repeated expensive query plan generation. The produced partitions not only are best served by distinct optimal query plans, but also leverage the partition-driven pruning opportunity. Experimental results with both synthetic and real life stream data confirm that CMR outperforms the state-of-the-art solutions up to an order of magnitude in both the query optimization time and the run-time execution performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Difallah:2013:OBE, author = "Djellel Eddine Difallah and Andrew Pavlo and Carlo Curino and Philippe Cudr{\'e}-Mauroux", title = "{OLTP-Bench}: an extensible testbed for benchmarking relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "277--288", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Benchmarking is an essential aspect of any database management system (DBMS) effort. Despite several recent advancements, such as pre-configured cloud database images and database-as-a-service (DBaaS) offerings, the deployment of a comprehensive testing platform with a diverse set of datasets and workloads is still far from being trivial. In many cases, researchers and developers are limited to a small number of workloads to evaluate the performance characteristics of their work. This is due to the lack of a universal benchmarking infrastructure, and to the difficulty of gaining access to real data and workloads. This results in lots of unnecessary engineering efforts and makes the performance evaluation results difficult to compare. To remedy these problems, we present OLTP-Bench, an extensible ``batteries included'' DBMS benchmarking testbed. The key contributions of OLTP-Bench are its ease of use and extensibility, support for tight control of transaction mixtures, request rates, and access distributions over time, as well as the ability to support all major DBMSs and DBaaS platforms. Moreover, it is bundled with fifteen workloads that all differ in complexity and system demands, including four synthetic workloads, eight workloads from popular benchmarks, and three workloads that are derived from real-world applications.
We demonstrate through a comprehensive set of experiments conducted on popular DBMS and DBaaS offerings the different features provided by OLTP-Bench and the effectiveness of our testbed in characterizing the performance of database services.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nandi:2013:GQS, author = "Arnab Nandi and Lilong Jiang and Michael Mandel", title = "Gestural query specification", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "4", pages = "289--300", month = dec, year = "2013", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:02 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Direct, ad-hoc interaction with databases has typically been performed over console-oriented conversational interfaces using query languages such as SQL. With the rise in popularity of gestural user interfaces and computing devices that use gestures as their exclusive modes of interaction, database query interfaces require a fundamental rethinking to work without keyboards. We present a novel query specification system that allows the user to query databases using a series of gestures. We present a novel gesture recognition system that uses both the interaction and the state of the database to classify gestural input into relational database queries. We conduct exhaustive systems performance tests and user studies to demonstrate that our system is not only performant and capable of interactive latencies, but it is also more usable, faster to use and more intuitive than existing systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Heise:2013:SDU,
  author =       "Arvid Heise and Jorge-Arnulfo Quian{\'e}-Ruiz and Ziawasch Abedjan and Anja Jentzsch and Felix Naumann",
  title =        "Scalable discovery of unique column combinations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "4",
  pages =        "301--312",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:02 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The discovery of all unique (and non-unique) column combinations in a given dataset is at the core of any data profiling effort. The results are useful for a large number of areas of data management, such as anomaly detection, data integration, data modeling, duplicate detection, indexing, and query optimization. However, discovering all unique and non-unique column combinations is an NP-hard problem, which in principle requires to verify an exponential number of column combinations for uniqueness on all data values. Thus, achieving efficiency and scalability in this context is a tremendous challenge by itself. In this paper, we devise Ducc, a scalable and efficient approach to the problem of finding all unique and non-unique column combinations in big datasets. We first model the problem as a graph coloring problem and analyze the pruning effect of individual combinations. We then present our hybrid column-based pruning technique, which traverses the lattice in a depth-first and random walk combination. This strategy allows Ducc to typically depend on the solution set size and hence to prune large swaths of the lattice. Ducc also incorporates row-based pruning to run uniqueness checks in just few milliseconds. To achieve even higher scalability, Ducc runs on several CPU cores (scale-up) and compute nodes (scale-out) with a very low overhead.
We exhaustively evaluate Ducc using three datasets (two real and one synthetic) with several millions rows and hundreds of attributes. We compare Ducc with related work: Gordian and HCA. The results show that Ducc is up to more than 2 orders of magnitude faster than Gordian and HCA (631x faster than Gordian and 398x faster than HCA). Finally, a series of scalability experiments shows the efficiency of Ducc to scale up and out.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The second author's family name is the single letter U; it is a complete
%%% surname, not an abbreviated initial, so it carries no trailing period.
@Article{Tang:2013:EMD,
  author =       "Yu Tang and Leong Hou U and Yilun Cai and Nikos Mamoulis and Reynold Cheng",
  title =        "{Earth Mover's Distance} based similarity search at scale",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "4",
  pages =        "313--324",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:02 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Earth Mover's Distance (EMD), as a similarity measure, has received a lot of attention in the fields of multimedia and probabilistic databases, computer vision, image retrieval, machine learning, etc. EMD on multidimensional histograms provides better distinguishability between the objects approximated by the histograms (e.g., images), compared to classic measures like Euclidean distance. Despite its usefulness, EMD has a high computational cost; therefore, a number of effective filtering methods have been proposed, to reduce the pairs of histograms for which the exact EMD has to be computed, during similarity search. Still, EMD calculations in the refinement step remain the bottleneck of the whole similarity search process.
In this paper, we focus on optimizing the refinement phase of EMD-based similarity search by (i) adapting an efficient min-cost flow algorithm (SIA) for EMD computation, (ii) proposing a dynamic distance bound, which can be used to terminate an EMD refinement early, and (iii) proposing a dynamic refinement order for the candidates which, paired with a concurrent EMD refinement strategy, reduces the amount of needless computations. Our proposed techniques are orthogonal to and can be easily integrated with the state-of-the-art filtering techniques, reducing the cost of EMD-based similarity queries by orders of magnitude.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Parameswaran:2013:SVD,
  author =       "Aditya Parameswaran and Neoklis Polyzotis and Hector Garcia-Molina",
  title =        "{SeeDB}: visualizing database queries efficiently",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "4",
  pages =        "325--328",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:02 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data scientists rely on visualizations to interpret the data returned by queries, but finding the right visualization remains a manual task that is often laborious. We propose a DBMS that partially automates the task of finding the right visualizations for a query. In a nutshell, given an input query Q, the new DBMS optimizer will explore not only the space of physical plans for Q, but also the space of possible visualizations for the results of Q. The output will comprise a recommendation of potentially ``interesting'' or ``useful'' visualizations, where each visualization is coupled with a suitable query execution plan.
We discuss the technical challenges in building this system and outline an agenda for future research.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mahmoud:2014:MES,
  author =       "Hatem A. Mahmoud and Vaibhav Arora and Faisal Nawab and Divyakant Agrawal and Amr {El Abbadi}",
  title =        "{MaaT}: effective and scalable coordination of distributed transactions in the cloud",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "329--340",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The past decade has witnessed an increasing adoption of cloud database technology, which provides better scalability, availability, and fault-tolerance via transparent partitioning and replication, and automatic load balancing and fail-over. However, only a small number of cloud databases provide strong consistency guarantees for distributed transactions, despite decades of research on distributed transaction processing, due to practical challenges that arise in the cloud setting, where failures are the norm, and human administration is minimal. For example, dealing with locks left by transactions initiated by failed machines, and determining a multi-programming level that avoids thrashing without under-utilizing available resources, are some of the challenges that arise when using lock-based transaction processing mechanisms in the cloud context. Even in the case of optimistic concurrency control, most proposals in the literature deal with distributed validation but still require the database to acquire locks during two-phase commit when installing updates of a single transaction on multiple machines.
Very little theoretical work has been done to entirely eliminate the need for locking in distributed transactions, including locks acquired during two-phase commit. In this paper, we re-design optimistic concurrency control to eliminate any need for locking even for atomic commitment, while handling the practical issues in earlier theoretical work related to this problem. We conduct an extensive experimental study to evaluate our approach against lock-based methods under various setups and workloads, and demonstrate that our approach provides many practical advantages in the cloud context.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2014:DWA,
  author =       "Chao Li and Michael Hay and Gerome Miklau and Yue Wang",
  title =        "A data- and workload-aware algorithm for range queries under differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "341--352",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We describe a new algorithm for answering a given set of range queries under $ \epsilon $-differential privacy which often achieves substantially lower error than competing methods. Our algorithm satisfies differential privacy by adding noise that is adapted to the input data and to the given query set. We first privately learn a partitioning of the domain into buckets that suit the input data well. Then we privately estimate counts for each bucket, doing so in a manner well-suited for the given query set.
Since the performance of the algorithm depends on the input database, we evaluate it on a wide range of real datasets, showing that we can achieve the benefits of data-dependence on both ``easy'' and ``hard'' databases.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Greco:2014:CQA,
  author =       "Sergio Greco and Fabian Pijcke and Jef Wijsen",
  title =        "Certain query answering in partially consistent databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "353--364",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "A database is called uncertain if two or more tuples of the same relation are allowed to agree on their primary key. Intuitively, such tuples act as alternatives for each other. A repair (or possible world) of such uncertain database is obtained by selecting a maximal number of tuples without ever selecting two tuples of the same relation that agree on their primary key. For a Boolean query $q$, the problem $ {\rm CERTAINTY}(q)$ takes as input an uncertain database db and asks whether $q$ evaluates to true on every repair of db. In recent years, the complexity of $ {\rm CERTAINTY}(q)$ has been studied under different restrictions on $q$. These complexity studies have assumed no restrictions on the uncertain databases that are input to $ {\rm CERTAINTY}(q)$. In practice, however, it may be known that these input databases are partially consistent, in the sense that they satisfy some dependencies (e.g., functional dependencies). In this article, we introduce the problem $ {\rm CERTAINTY}(q)$ in the presence of a set $ \Sigma $ of dependencies.
The problem $ {\rm CERTAINTY}(q, \Sigma)$ takes as input an uncertain database db that satisfies $ \Sigma $, and asks whether every repair of db satisfies $q$. We focus on the complexity of $ {\rm CERTAINTY}(q, \Sigma)$ when $q$ is an acyclic conjunctive query without self-join, and $ \Sigma $ is a set of functional dependencies and join dependencies, the latter of a particular form. We provide an algorithm that, given $q$ and $ \Sigma $, decides whether $ {\rm CERTAINTY}(q, \Sigma)$ is first-order expressible. Moreover, we show how to effectively construct a first-order definition of $ {\rm CERTAINTY}(q, \Sigma)$ if it exists.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Mottin:2014:EQG,
  author =       "Davide Mottin and Matteo Lissandrini and Yannis Velegrakis and Themis Palpanas",
  title =        "Exemplar queries: give me an example of what you need",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "365--376",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Search engines are continuously employing advanced techniques that aim to capture user intentions and provide results that go beyond the data that simply satisfy the query conditions. Examples include the personalized results, related searches, similarity search, popular and relaxed queries. In this work we introduce a novel query paradigm that considers a user query as an example of the data in which the user is interested. We call these queries exemplar queries and claim that they can play an important role in dealing with the information deluge.
We provide a formal specification of the semantics of such queries and show that they are fundamentally different from notions like queries by example, approximate and related queries. We provide an implementation of these semantics for graph-based data and present an exact solution with a number of optimizations that improve performance without compromising the quality of the answers. We also provide an approximate solution that prunes the search space and achieves considerably better time-performance with minimal or no impact on effectiveness. We experimentally evaluate the effectiveness and efficiency of these solutions with synthetic and real datasets, and illustrate the usefulness of exemplar queries in practice.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Korula:2014:ERA,
  author =       "Nitish Korula and Silvio Lattanzi",
  title =        "An efficient reconciliation algorithm for social networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "377--388",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "People today typically use multiple online social networks (Facebook, Twitter, Google+, LinkedIn, etc.). Each online network represents a subset of their ``real'' ego-networks. An interesting and challenging problem is to reconcile these online networks, that is, to identify all the accounts belonging to the same individual. Besides providing a richer understanding of social dynamics, the problem has a number of practical applications. At first sight, this problem appears algorithmically challenging.
Fortunately, a small fraction of individuals explicitly link their accounts across multiple networks; our work leverages these connections to identify a very large fraction of the network. Our main contributions are to mathematically formalize the problem for the first time, and to design a simple, local, and efficient parallel algorithm to solve it. We are able to prove strong theoretical guarantees on the algorithm's performance on well-established network models (Random Graphs, Preferential Attachment). We also experimentally confirm the effectiveness of the algorithm on synthetic and real social network data sets.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chester:2014:CKR,
  author =       "Sean Chester and Alex Thomo and S. Venkatesh and Sue Whitesides",
  title =        "Computing $k$-regret minimizing sets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "389--400",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Regret minimizing sets are a recent approach to representing a dataset $D$ by a small subset $R$ of size $r$ of representative data points. The set $R$ is chosen such that executing any top-1 query on $R$ rather than $D$ is minimally perceptible to any user. However, such a subset $R$ may not exist, even for modest sizes, $r$. In this paper, we introduce the relaxation to $k$-regret minimizing sets, whereby a top-$1$ query on $R$ returns a result imperceptibly close to the top-$k$ on $D$. We show that, in general, with or without the relaxation, this problem is NP-hard. For the specific case of two dimensions, we give an efficient dynamic programming, plane sweep algorithm based on geometric duality to find an optimal solution.
For arbitrary dimension, we give an empirically effective, greedy, randomized algorithm based on linear programming. With these algorithms, we can find subsets $R$ of much smaller size that better summarize $D$, using small values of $k$ larger than $1$.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yu:2014:RTK,
  author =       "Adams Wei Yu and Nikos Mamoulis and Hao Su",
  title =        "Reverse top-$k$ search using random walk with restart",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "401--412",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "With the increasing popularity of social networks, large volumes of graph data are becoming available. Large graphs are also derived by structure extraction from relational, text, or scientific data (e.g., relational tuple networks, citation graphs, ontology networks, protein-protein interaction graphs). Node-to-node proximity is the key building block for many graph-based applications that search or analyze the data. Among various proximity measures, random walk with restart (RWR) is widely adopted because of its ability to consider the global structure of the whole network. Although RWR-based similarity search has been well studied before, there is no prior work on reverse top-$k$ proximity search in graphs based on RWR. We discuss the applicability of this query and show that its direct evaluation using existing methods on RWR-based similarity search has very high computational and storage demands. To address this issue, we propose an indexing technique, paired with an on-line reverse top-$k$ search algorithm.
Our experiments show that our technique is efficient and has manageable storage requirements even when applied on very large graphs.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Viglas:2014:WLS,
  author =       "Stratis D. Viglas",
  title =        "Write-limited sorts and joins for persistent memory",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "413--424",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "To mitigate the impact of the widening gap between the memory needs of CPUs and what standard memory technology can deliver, system architects have introduced a new class of memory technology termed persistent memory. Persistent memory is byte-addressable, but exhibits asymmetric I/O: writes are typically one order of magnitude more expensive than reads. Byte addressability combined with I/O asymmetry render the performance profile of persistent memory unique. Thus, it becomes imperative to find new ways to seamlessly incorporate it into database systems. We do so in the context of query processing. We focus on the fundamental operations of sort and join processing. We introduce the notion of write-limited algorithms that effectively minimize the I/O cost. We give a high-level API that enables the system to dynamically optimize the workflow of the algorithms; or, alternatively, allows the developer to tune the write profile of the algorithms. We present four different techniques to incorporate persistent memory into the database processing stack in light of this API. We have implemented and extensively evaluated all our proposals. Our results show that the algorithms deliver on their promise of I/O-minimality and tunable performance.
We showcase the merits and deficiencies of each implementation technique, thus taking a solid first step towards incorporating persistent memory into query processing.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% Author given names are stored in full so that the bibliography style,
%%% not the database, decides whether to abbreviate them to initials.
%%% NOTE(review): expanded from the initials printed on the paper; confirm
%%% the spellings against the publisher's record.
@Article{Anciaux:2014:FOD,
  author =       "Nicolas Anciaux and Luc Bouganim and Thierry Delot and Sergio Ilarri and Le{\"\i}la Kloul and Nathalie Mitton and Philippe Pucheral",
  title =        "{Folk-IS}: opportunistic data services in least developed countries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "5",
  pages =        "425--428",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:04 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "According to a wide range of studies, IT should become a key facilitator in establishing primary education, reducing mortality and supporting commercial initiatives in Least Developed Countries (LDCs). The main barrier to the development of IT services in these regions is not only the lack of communication facilities, but also the lack of consistent information systems, security procedures, economic and legal support, as well as political commitment. In this paper, we propose the vision of an infrastructureless data platform well suited for the development of innovative IT services in LDCs. We propose a participatory approach, where each individual implements a small subset of a complete information system thanks to highly secure, portable and low-cost personal devices as well as opportunistic networking, without the need of any form of infrastructure. We review the technical challenges that are specific to this approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Giannikis:2014:SWO,
  author =       "Georgios Giannikis and Darko Makreshanski and Gustavo Alonso and Donald Kossmann",
  title =        "Shared workload optimization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "429--440",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "As a result of increases in both the query load and the data managed, as well as changes in hardware architecture (multicore), the last years have seen a shift from query-at-a-time approaches towards shared work (SW) systems where queries are executed in groups. Such groups share operators like scans and joins, leading to systems that process hundreds to thousands of queries in one go. SW systems range from storage engines that use in-memory co-operative scans to more complex query processing engines that share joins over analytical and star schema queries. In all cases, they rely on either single query optimizers, predicate sharing, or on manually generated plans. In this paper we explore the problem of shared workload optimization (SWO) for SW systems. The challenge in doing so is that the optimization has to be done for the entire workload and that results in a class of stochastic knapsack with uncertain weights optimization, which can only be addressed with heuristics to achieve a reasonable runtime. In this paper we focus on hash joins and shared scans and present a first algorithm capable of optimizing the execution of entire workloads by deriving a global executing plan for all the queries in the system. We evaluate the optimizer over the TPC-W and the TPC-H benchmarks.
The results prove the feasibility of this approach and demonstrate the performance gains that can be obtained from SW systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% The competitive ratio quoted in the abstract is the decimal number 3.75
%%% (a decimal point, not a colon-separated ratio).
@Article{Elseidy:2014:SAO,
  author =       "Mohammed Elseidy and Abdallah Elguindy and Aleksandar Vitorovic and Christoph Koch",
  title =        "Scalable and adaptive online joins",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "441--452",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Scalable join processing in a parallel shared-nothing environment requires a partitioning policy that evenly distributes the processing load while minimizing the size of state maintained and number of messages communicated. Previous research proposes static partitioning schemes that require statistics beforehand. In an online or streaming environment in which no statistics about the workload are known, traditional static approaches perform poorly. This paper presents a novel parallel online dataflow join operator that supports arbitrary join predicates. The proposed operator continuously adjusts itself to the data dynamics through adaptive dataflow routing and state repartitioning. The operator is resilient to data skew, maintains high throughput rates, avoids blocking behavior during state repartitioning, takes an eventual consistency approach for maintaining its local state, and behaves strongly consistently as a black-box dataflow operator. We prove that the operator ensures a constant competitive ratio 3.75 in data distribution optimality and that the cost of processing an input tuple is amortized constant, taking into account adaptivity costs.
Our evaluation demonstrates that our operator outperforms the state-of-the-art static partitioning schemes in resource utilization, throughput, and execution time.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Morton:2014:SDE,
  author =       "Kristi Morton and Magdalena Balazinska and Dan Grossman and Jock Mackinlay",
  title =        "Support the data enthusiast: challenges for next-generation data-analysis systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "453--456",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We present a vision of next-generation visual analytics services. We argue that these services should have three related capabilities: support visual and interactive data exploration as they do today, but also suggest relevant data to enrich visualizations, and facilitate the integration and cleaning of that data. Most importantly, they should provide all these capabilities seamlessly in the context of an uninterrupted data analysis cycle. We present the challenges and opportunities in building next-generation visual analytics services.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Deutch:2014:PFD,
  author =       "Daniel Deutch and Yuval Moskovitch and Val Tannen",
  title =        "A provenance framework for data-dependent process analysis",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "457--468",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "A data-dependent process (DDP) models an application whose control flow is guided by a finite state machine, as well as by the state of an underlying database. DDPs are commonly found e.g., in e-commerce. In this paper we develop a framework supporting the use of provenance in static (temporal) analysis of possible DDP executions. Using provenance support, analysts can interactively test and explore the effect of hypothetical modifications to a DDP's state machine and/or to the underlying database. They can also extend the analysis to incorporate the propagation of annotations from meta-domains of interest, e.g., cost or access privileges. Toward this goal we note that the framework of semiring-based provenance was proven highly effective in fulfilling similar needs in the context of database queries. In this paper we consider novel constructions that generalize the semiring approach to the context of DDP analysis. These constructions address two interacting new challenges: (1) to combine provenance annotations for both information that resides in the database and information about external inputs (e.g., user choices), and (2) to finitely capture infinite process executions. We analyze our solution from theoretical and experimental perspectives, proving its effectiveness.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chiang:2014:TED,
  author =       "Yueh-Hsuan Chiang and AnHai Doan and Jeffrey F. Naughton",
  title =        "Tracking entities in the dynamic world: a fast algorithm for matching temporal records",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "469--480",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Identifying records referring to the same real world entity over time enables longitudinal data analysis. However, difficulties arise from the dynamic nature of the world: the entities described by a temporal data set often evolve their states over time. While the state of the art approach to temporal entity matching achieves high accuracy, this approach is computationally expensive and cannot handle large data sets. In this paper, we present an approach that achieves equivalent matching accuracy but takes far less time. Our key insight is ``static first, dynamic second.'' Our approach first runs an evidence-collection pass, grouping records without considering the possibility of entity evolution, as if the world were ``static.'' Then, it merges clusters from the initial grouping by determining whether an entity might evolve from the state described in one cluster to the state described in another cluster. This intuitively reduces a difficult problem, record matching with evolution, to two simpler problems: record matching without evolution, then ``evolution detection'' among the resulting clusters.
Experimental results on several temporal data sets show that our approach provides an order of magnitude improvement in run time over the state-of-the-art approach while producing equivalent matching accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Conway:2014:EAS,
  author =       "Neil Conway and Peter Alvaro and Emily Andrews and Joseph M. Hellerstein",
  title =        "{Edelweiss}: automatic storage reclamation for distributed programming",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "6",
  pages =        "481--492",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 09:22:06 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Event Log Exchange (ELE) is a common programming pattern based on immutable state and messaging. ELE sidesteps traditional challenges in distributed consistency, at the expense of introducing new challenges in designing space reclamation protocols to avoid consuming unbounded storage. We introduce Edelweiss, a sublanguage of Bloom that provides an ELE programming model, yet automatically reclaims space without programmer assistance. We describe techniques to analyze Edelweiss programs and automatically generate application-specific distributed space reclamation logic. We show how Edelweiss can be used to elegantly implement a variety of communication and distributed storage protocols; the storage reclamation code generated by Edelweiss effectively garbage-collects state and often matches hand-written protocols from the literature.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ntarmos:2014:RJQ, author = "Nikos Ntarmos and Ioannis Patlakas and Peter Triantafillou", title = "Rank join queries in {NoSQL} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "493--504", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Rank (i.e., top-$k$) join queries play a key role in modern analytics tasks. However, despite their importance and unlike centralized settings, they have been completely overlooked in cloud NoSQL settings. We attempt to fill this gap: We contribute a suite of solutions and study their performance comprehensively. Baseline solutions are offered using SQL-like languages (like Hive and Pig), based on MapReduce jobs. We first provide solutions that are based on specialized indices, which may themselves be accessed using either MapReduce or coordinator-based strategies. The first index-based solution is based on inverted indices, which are accessed with MapReduce jobs. The second index-based solution adapts a popular centralized rank-join algorithm. We further contribute a novel statistical structure comprising histograms and Bloom filters, which forms the basis for the third index-based solution. We provide (i) MapReduce algorithms showing how to build these indices and statistical structures, (ii) algorithms to allow for online updates to these indices, and (iii) query processing algorithms utilizing them. We implemented all algorithms in Hadoop (HDFS) and HBase and tested them on TPC-H datasets of various scales, utilizing different queries on tables of various sizes and different score-attribute distributions. 
We ported our implementations to Amazon EC2 and ``in-house'' lab clusters of various scales. We provide performance results for three metrics: query execution time, network bandwidth consumption, and dollar-cost for query execution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gupta:2014:BOS, author = "Rahul Gupta and Alon Halevy and Xuezhi Wang and Steven Euijong Whang and Fei Wu", title = "{Biperpedia}: an ontology for search applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "505--516", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Search engines make significant efforts to recognize queries that can be answered by structured data and invest heavily in creating and maintaining high-precision databases. While these databases have a relatively wide coverage of entities, the number of attributes they model (e.g., GDP, CAPITAL, ANTHEM) is relatively small. Extending the number of attributes known to the search engine can enable it to more precisely answer queries from the long and heavy tail, extract a broader range of facts from the Web, and recover the semantics of tables on the Web. We describe Biperpedia, an ontology with 1.6M (class, attribute) pairs and 67K distinct attribute names. Biperpedia extracts attributes from the query stream, and then uses the best extractions to seed attribute extraction from text. For every attribute Biperpedia saves a set of synonyms and text patterns in which it appears, thereby enabling it to recognize the attribute in more contexts. 
In addition to a detailed analysis of the quality of Biperpedia, we show that it can increase the number of Web tables whose semantics we can recover by more than a factor of 4 compared with Freebase.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elseidy:2014:GFS, author = "Mohammed Elseidy and Ehab Abdelhamid and Spiros Skiadopoulos and Panos Kalnis", title = "{GraMi}: frequent subgraph and pattern mining in a single large graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "517--528", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Mining frequent subgraphs is an important operation on graphs; it is defined as finding all subgraphs that appear frequently in a database according to a given frequency threshold. Most existing work assumes a database of many small graphs, but modern applications, such as social networks, citation graphs, or protein-protein interactions in bioinformatics, are modeled as a single large graph. In this paper we present GraMi, a novel framework for frequent subgraph mining in a single large graph. GraMi undertakes a novel approach that only finds the minimal set of instances to satisfy the frequency threshold and avoids the costly enumeration of all instances required by previous approaches. We accompany our approach with a heuristic and optimizations that significantly improve performance. Additionally, we present an extension of GraMi that mines frequent patterns. Compared to subgraphs, patterns offer a more powerful version of matching that captures transitive interactions between graph nodes (like friend of a friend) which are very common in modern applications. 
Finally, we present CGraMi, a version supporting structural and semantic constraints, and AGraMi, an approximate version producing results with no false positives. Our experiments on real data demonstrate that our framework is up to 2 orders of magnitude faster and discovers more interesting patterns than existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:LIO, author = "Sheng Wang and David Maier and Beng Chin Ooi", title = "Lightweight indexing of observational data in log-structured storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "529--540", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Huge amounts of data are being generated by sensing devices every day, recording the status of objects and the environment. Such observational data is widely used in scientific research. As the capabilities of sensors keep improving, the data produced are drastically expanding in precision and quantity, making it a write-intensive domain. Log-structured storage is capable of providing high write throughput, and hence is a natural choice for managing large-scale observational data. In this paper, we propose an approach to indexing and querying observational data in log-structured storage. Based on key traits of observational data, we design a novel index approach called the CR-index (Continuous Range Index), which provides fast query performance without compromising write throughput. It is a lightweight structure that is fast to construct and often small enough to reside in RAM. Our experimental results show that the CR-index is superior in handling observational data compared to other indexing techniques. 
While our focus is scientific data, we believe our index will be effective for other applications with similar properties, such as process monitoring in manufacturing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2014:EES, author = "Dawei Jiang and Gang Chen and Beng Chin Ooi and Kian-Lee Tan and Sai Wu", title = "{epiC}: an extensible and scalable system for processing big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "541--552", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Big Data problem is characterized by the so called 3V features: Volume --- a huge amount of data, Velocity --- a high data ingestion rate, and Variety --- a mix of structured data, semi-structured data, and unstructured data. The state-of-the-art solutions to the Big Data problem are largely based on the MapReduce framework (aka its open source implementation Hadoop). Although Hadoop handles the data volume challenge successfully, it does not deal with the data variety well since the programming interfaces and its associated data processing model is inconvenient and inefficient for handling structured data and graph data. This paper presents epiC, an extensible system to tackle the Big Data's data variety challenge. epiC introduces a general Actor-like concurrent programming model, independent of the data processing models, for specifying parallel computations. Users process multi-structured datasets with appropriate epiC extensions, the implementation of a data processing model best suited for the data type and auxiliary code for mapping that data processing model into epiC's concurrent programming model. 
Like Hadoop, programs written in this way can be automatically parallelized and the runtime system takes care of fault tolerance and inter-machine communications. We present the design and implementation of epiC's concurrent programming model. We also present two customized data processing model, an optimized MapReduce extension and a relational model, on top of epiC. Experiments demonstrate the effectiveness and efficiency of our proposed epiC.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Boehm:2014:HPS, author = "Matthias Boehm and Shirish Tatikonda and Berthold Reinwald and Prithviraj Sen and Yuanyuan Tian and Douglas R. Burdick and Shivakumar Vaithyanathan", title = "Hybrid parallelization strategies for large-scale machine learning in {SystemML}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "553--564", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SystemML aims at declarative, large-scale machine learning (ML) on top of MapReduce, where high-level ML scripts with R-like syntax are compiled to programs of MR jobs. The declarative specification of ML algorithms enables --- in contrast to existing large-scale machine learning libraries --- automatic optimization. SystemML's primary focus is on data parallelism but many ML algorithms inherently exhibit opportunities for task parallelism as well. A major challenge is how to efficiently combine both types of parallelism for arbitrary ML scripts and workloads. In this paper, we present a systematic approach for combining task and data parallelism for large-scale machine learning on top of MapReduce. 
We employ a generic Parallel FOR construct (ParFOR) as known from high performance computing (HPC). Our core contributions are (1) complementary parallelization strategies for exploiting multi-core and cluster parallelism, as well as (2) a novel cost-based optimization framework for automatically creating optimal parallel execution plans. Experiments on a variety of use cases showed that this achieves both efficiency and scalability due to automatic adaptation to ad-hoc workloads and unknown data characteristics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2014:SSG, author = "Shengqi Yang and Yinghui Wu and Huan Sun and Xifeng Yan", title = "Schemaless and structureless graph querying", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "565--576", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Querying complex graph databases such as knowledge graphs is a challenging task for non-professional users. Due to their complex schemas and variational information descriptions, it becomes very hard for users to formulate a query that can be properly processed by the existing systems. We argue that for a user-friendly graph query engine, it must support various kinds of transformations such as synonym, abbreviation, and ontology. Furthermore, the derived query results must be ranked in a principled manner. In this paper, we introduce a novel framework enabling schemaless and structureless graph querying (SLQ), where a user need not describe queries precisely as required by most databases. The query engine is built on a set of transformation functions that automatically map keywords and linkages from a query to their matches in a graph. 
It automatically learns an effective ranking model, without assuming manually labeled training examples, and can efficiently return top ranked matches using graph sketch and belief propagation. The architecture of SLQ is elastic for ``plug-in'' new transformation functions and query logs. Our experimental results show that this new graph querying paradigm is promising: It identifies high-quality matches for both keyword and graph queries over real-life knowledge graphs, and outperforms existing methods significantly in terms of effectiveness and efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Salihoglu:2014:OGA, author = "Semih Salihoglu and Jennifer Widom", title = "Optimizing graph algorithms on {Pregel}-like systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "577--588", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of implementing graph algorithms efficiently on Pregel-like systems, which can be surprisingly challenging. Standard graph algorithms in this setting can incur unnecessary inefficiencies such as slow convergence or high communication or computation cost, typically due to structural properties of the input graphs such as large diameters or skew in component sizes. We describe several optimization techniques to address these inefficiencies. Our most general technique is based on the idea of performing some serial computation on a tiny fraction of the input graph, complementing Pregel's vertex-centric parallelism. 
We base our study on thorough implementations of several fundamental graph algorithms, some of which have, to the best of our knowledge, not been implemented on Pregel-like systems before. The algorithms and optimizations we describe are fully implemented in our open-source Pregel implementation. We present detailed experiments showing that our optimization techniques improve runtime significantly on a variety of very large graph datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:TCF, author = "You Wu and Pankaj K. Agarwal and Chengkai Li and Jun Yang and Cong Yu", title = "Toward computational fact-checking", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "7", pages = "589--600", month = mar, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:07 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Our news are saturated with claims of ``facts'' made from data. Database research has in the past focused on how to answer queries, but has not devoted much attention to discerning more subtle qualities of the resulting claims, e.g., is a claim ``cherry-picking''? This paper proposes a framework that models claims based on structured data as parameterized queries. A key insight is that we can learn a lot about a claim by perturbing its parameters and seeing how its conclusion changes. This framework lets us formulate practical fact-checking tasks --- reverse-engineering (often intentionally) vague claims, and countering questionable claims --- as computational problems. Along with the modeling framework, we develop an algorithmic framework that enables efficient instantiations of ``meta'' algorithms by supplying appropriate algorithmic building blocks. 
We present real-world examples and experiments that demonstrate the power of our model, efficiency of our algorithms, and usefulness of their results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arenas:2014:PAB, author = "Marcelo Arenas and Gonzalo D{\'\i}az and Achille Fokoue and Anastasios Kementsietsidis and Kavitha Srinivas", title = "A principled approach to bridging the gap between graph data and their schemas", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "8", pages = "601--612", month = apr, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:10 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Although RDF graph data often come with an associated schema, recent studies have proven that real RDF data rarely conform to their perceived schemas. Since a number of data management decisions, including storage layouts, indexing, and efficient query processing, use schemas to guide the decision making, it is imperative to have an accurate description of the structuredness of the data at hand (how well the data conform to the schema). In this paper, we have approached the study of the structuredness of an RDF graph in a principled way: we propose a framework for specifying structuredness functions, which gauge the degree to which an RDF graph conforms to a schema. In particular, we first define a formal language for specifying structuredness functions with expressions we call rules. This language allows a user to state a rule to which an RDF graph may fully or partially conform. Then we consider the issue of discovering a refinement of a sort (type) by partitioning the dataset into subsets whose structuredness is over a specified threshold. 
In particular, we prove that the natural decision problem associated to this refinement problem is NP-complete, and we provide a natural translation of this problem into Integer Linear Programming (ILP). Finally, we test this ILP solution with three real world datasets and three different and intuitive rules, which gauge the structuredness in different ways. We show that the rules give meaningful refinements of the datasets, showing that our language can be a powerful tool for understanding the structure of RDF data, and we show that the ILP solution is practical for a large fraction of existing data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:EPS, author = "Dongxiang Zhang and Chee-Yong Chan and Kian-Lee Tan", title = "An efficient publish\slash subscribe index for e-commerce databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "8", pages = "613--624", month = apr, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:10 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many of today's publish/subscribe (pub/sub) systems have been designed to cope with a large volume of subscriptions and high event arrival rate (velocity). However, in many novel applications (such as e-commerce), there is an increasing variety of items, each with different attributes. This leads to a very high-dimensional and sparse database that existing pub/sub systems can no longer support effectively. In this paper, we propose an efficient in-memory index that is scalable to the volume and update of subscriptions, the arrival rate of events and the variety of subscribable attributes. The index is also extensible to support complex scenarios such as prefix/suffix filtering and regular expression matching. 
We conduct extensive experiments on synthetic datasets and two real datasets (AOL query log and Ebay products). The results demonstrate the superiority of our index over state-of-the-art methods: our index incurs orders of magnitude less index construction time, consumes a small amount of memory and performs event matching efficiently.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2014:SSJ, author = "Yu Jiang and Guoliang Li and Jianhua Feng and Wen-Syan Li", title = "String similarity joins: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "8", pages = "625--636", month = apr, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:10 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "String similarity join is an important operation in data integration and cleansing that finds similar string pairs from two collections of strings. More than ten algorithms have been proposed to address this problem in the recent two decades. However, existing algorithms have not been thoroughly compared under the same experimental framework. For example, some algorithms are tested only on specific datasets. This makes it rather difficult for practitioners to decide which algorithms should be used for various scenarios. To address this problem, in this paper we provide a comprehensive survey on a wide spectrum of existing string similarity join algorithms, classify them into different categories based on their main techniques, and compare them through extensive experiments on a variety of real-world datasets with different characteristics. 
We also report comprehensive findings obtained from the experiments and provide new insights about the strengths and weaknesses of existing similarity join algorithms which can guide practitioners to select appropriate algorithms for various scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Proserpio:2014:CDS, author = "Davide Proserpio and Sharon Goldberg and Frank McSherry", title = "Calibrating data to sensitivity in private data analysis: a platform for differentially-private analysis of weighted datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "8", pages = "637--648", month = apr, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:10 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present an approach to differentially private computation in which one does not scale up the magnitude of noise for challenging queries, but rather scales down the contributions of challenging records. While scaling down all records uniformly is equivalent to scaling up the noise magnitude, we show that scaling records non-uniformly can result in substantially higher accuracy by bypassing the worst-case requirements of differential privacy for the noise magnitudes. This paper details the data analysis platform wPINQ, which generalizes the Privacy Integrated Query (PINQ) to weighted datasets. Using a few simple operators (including a non-uniformly scaling Join operator) wPINQ can reproduce (and improve) several recent results on graph analysis and introduce new generalizations (e.g., counting triangles with given degrees). 
We also show how to integrate probabilistic inference techniques to synthesize datasets respecting more complicated (and less easily interpreted) measurements.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:EMM, author = "Wei Wang and Beng Chin Ooi and Xiaoyan Yang and Dongxiang Zhang and Yueting Zhuang", title = "Effective multi-modal retrieval based on stacked auto-encoders", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "8", pages = "649--660", month = apr, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 09:22:10 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-modal retrieval is emerging as a new search paradigm that enables seamless information retrieval from various types of media. For example, users can simply snap a movie poster to search relevant reviews and trailers. To solve the problem, a set of mapping functions are learned to project high-dimensional features extracted from data of different media types into a common low-dimensional space so that metric distance measures can be applied. In this paper, we propose an effective mapping mechanism based on deep learning (i.e., stacked auto-encoders) for multi-modal retrieval. Mapping functions are learned by optimizing a new objective function, which captures both intra-modal and inter-modal semantic relationships of data from heterogeneous sources effectively. Compared with previous works which require a substantial amount of prior knowledge such as similarity matrices of intra-modal data and ranking examples, our method requires little prior knowledge. Given a large training dataset, we split it into mini-batches and continually adjust the mapping functions for each batch of input. 
Hence, our method is memory efficient with respect to the data volume. Experiments on three real datasets illustrate that our proposed method achieves significant improvement in search accuracy over the state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2014:PNF, author = "Renchu Song and Weiwei Sun and Baihua Zheng and Yu Zheng", title = "{PRESS}: a novel framework of trajectory compression in road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "661--672", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Location data becomes more and more important. In this paper, we focus on the trajectory data, and propose a new framework, namely PRESS (Paralleled Road-Network-Based Trajectory Compression), to effectively compress trajectory data under road network constraints. Different from existing work, PRESS proposes a novel representation for trajectories to separate the spatial representation of a trajectory from the temporal representation, and proposes a Hybrid Spatial Compression (HSC) algorithm and error Bounded Temporal Compression (BTC) algorithm to compress the spatial and temporal information of trajectories respectively. PRESS also supports common spatial-temporal queries without fully decompressing the data. Through an extensive experimental study on real trajectory dataset, PRESS significantly outperforms existing approaches in terms of saving storage cost of trajectory data with bounded errors.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2014:FCO, author = "Yajun Yang and Hong Gao and Jeffrey Xu Yu and Jianzhong Li", title = "Finding the cost-optimal path with time constraint over time-dependent graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "673--684", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest path query is an important problem and has been well studied in static graphs. However, in practice, the costs of edges in graphs always change over time. We call such graphs as time-dependent graphs. In this paper, we study how to find a cost-optimal path with time constraint in time-dependent graphs. Most existing works regarding the Time-Dependent Shortest Path (TDSP) problem focus on finding a shortest path with the minimum travel time. All these works are based on the following fact: the earliest arrival time at a vertex $v$ can be derived from the earliest arrival time at $v$'s neighbors. Unfortunately, this fact does not hold for our problem. In this paper, we propose a novel algorithm to compute a cost-optimal path with time constraint in time-dependent graphs. We show that the time and space complexities of our algorithm are $ O(k n \log n + m k)$ and $ O((n + m) k)$ respectively. We confirm the effectiveness and efficiency of our algorithm through conducting experiments on real datasets with synthetic cost.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Parameswaran:2014:OCP, author = "Aditya Parameswaran and Stephen Boyd and Hector Garcia-Molina and Ashish Gupta and Neoklis Polyzotis and Jennifer Widom", title = "Optimal crowd-powered rating and filtering algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "685--696", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We focus on crowd-powered filtering, i.e., filtering a large set of items using humans. Filtering is one of the most commonly used building blocks in crowdsourcing applications and systems. While solutions for crowd-powered filtering exist, they make a range of implicit assumptions and restrictions, ultimately rendering them not powerful enough for real-world applications. We describe two approaches to discard these implicit assumptions and restrictions: one, that carefully generalizes prior work, leading to an optimal, but often-times intractable solution, and another, that provides a novel way of reasoning about filtering strategies, leading to a sometimes suboptimal, but efficiently computable solution (that is asymptotically close to optimal). We demonstrate that our techniques lead to significant reductions in error of up to 30\% for fixed cost over prior work in a novel crowdsourcing application: peer evaluation in online courses.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gruenheid:2014:IRL, author = "Anja Gruenheid and Xin Luna Dong and Divesh Srivastava", title = "Incremental record linkage", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "697--708", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Record linkage clusters records such that each cluster corresponds to a single distinct real-world entity. It is a crucial step in data cleaning and data integration. In the big data era, the velocity of data updates is often high, quickly making previous linkage results obsolete. This paper presents an end-to-end framework that can incrementally and efficiently update linkage results when data updates arrive. Our algorithms not only allow merging records in the updates with existing clusters, but also allow leveraging new evidence from the updates to fix previous linkage errors. Experimental results on three real and synthetic data sets show that our algorithms can significantly reduce linkage time without sacrificing linkage quality.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Roy:2014:LLH, author = "Pratanu Roy and Jens Teubner and Rainer Gemulla", title = "Low-latency handshake join", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "709--720", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This work revisits the processing of stream joins on modern hardware architectures. Our work is based on the recently proposed handshake join algorithm, which is a mechanism to parallelize the processing of stream joins in a NUMA-aware and hardware-friendly manner. Handshake join achieves high throughput and scalability, but it suffers from a high latency penalty and a non-deterministic ordering of the tuples in the physical result stream. In this paper, we first characterize the latency behavior of the handshake join and then propose a new low-latency handshake join algorithm, which substantially reduces latency without sacrificing throughput or scalability. We also present a technique to generate punctuated result streams with very little overhead; such punctuations allow the generation of correctly ordered physical output streams with negligible effect on overall throughput and latency.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:PPT, author = "Huanhuan Wu and James Cheng and Silu Huang and Yiping Ke and Yi Lu and Yanyan Xu", title = "Path problems in temporal graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "721--732", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest path is a fundamental graph problem with numerous applications. However, the concept of classic shortest path is insufficient or even flawed in a temporal graph, as the temporal information determines the order of activities along any path. In this paper, we show the shortcomings of classic shortest path in a temporal graph, and study various concepts of ``shortest'' path for temporal graphs. Computing these temporal paths is challenging as subpaths of a ``shortest'' path may not be ``shortest'' in a temporal graph. We investigate properties of the temporal paths and propose efficient algorithms to compute them. We tested our algorithms on real world temporal graphs to verify their efficiency, and also show that temporal paths are essential for studying temporal graphs by comparing shortest paths in normal static graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2014:RRI, author = "Xin Cao and Gao Cong and Christian S. 
Jensen and Man Lung Yiu", title = "Retrieving regions of interest for user exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "733--744", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider an application scenario where points of interest (PoIs) each have a web presence and where a web user wants to identify a region that contains relevant PoIs that are relevant to a set of keywords, e.g., in preparation for deciding where to go to conveniently explore the PoIs. Motivated by this, we propose the length-constrained maximum-sum region (LCMSR) query that returns a spatial-network region that is located within a general region of interest, that does not exceed a given size constraint, and that best matches query keywords. Such a query maximizes the total weight of the PoIs in it w.r.t. the query keywords. We show that it is NP-hard to answer this query. We develop an approximation algorithm with a $ (5 + \epsilon)$ approximation ratio utilizing a technique that scales node weights into integers. We also propose a more efficient heuristic algorithm and a greedy algorithm. Empirical studies on real data offer detailed insight into the accuracy of the proposed algorithms and show that the proposed algorithms are capable of computing results efficiently and effectively.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2014:SLE, author = "Yingfan Liu and Jiangtao Cui and Zi Huang and Hui Li and Heng Tao Shen", title = "{SK--LSH}: an efficient index structure for approximate nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "745--756", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate Nearest Neighbor (ANN) search in high dimensional space has become a fundamental paradigm in many applications. Recently, Locality Sensitive Hashing (LSH) and its variants are acknowledged as the most promising solutions to ANN search. However, state-of-the-art LSH approaches suffer from a drawback: accesses to candidate objects require a large number of random I/O operations. In order to guarantee the quality of returned results, sufficient objects should be verified, which would consume enormous I/O cost. To address this issue, we propose a novel method, called SortingKeys-LSH (SK-LSH), which reduces the number of page accesses through locally arranging candidate objects. We firstly define a new measure to evaluate the distance between the compound hash keys of two points. A linear order relationship on the set of compound hash keys is then created, and the corresponding data points can be sorted accordingly. Hence, data points that are close to each other according to the distance measure can be stored locally in an index file. During the ANN search, only a limited number of disk pages among few index files are necessary to be accessed for sufficient candidate generation and verification, which not only significantly reduces the response time but also improves the accuracy of the returned results. 
Our exhaustive empirical study over several real-world data sets demonstrates the superior efficiency and accuracy of SK-LSH for the ANN search, compared with state-of-the-art methods, including LSB, C2LSH and CK-Means.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2014:AFP, author = "Bing-Rong Lin and Daniel Kifer", title = "On arbitrage-free pricing for general data queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "757--768", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data is a commodity. Recent research has considered the mathematical problem of setting prices for different queries over data. Ideal pricing functions need to be flexible --- defined for arbitrary queries (select-project-join, aggregate, random sample, and noisy privacy-preserving queries). They should be fine-grained --- a consumer should not be required to buy the entire database to get answers to simple ``low-information'' queries (such as selecting only a few tuples or aggregating over only one attribute). Similarly, a consumer may not want to pay a large amount of money, only to discover that the database is empty. Finally, pricing functions should satisfy consistency conditions such as being ``arbitrage-free'' --- consumers should not be able to circumvent the pricing function by deducing the answer to an expensive query from a few cheap queries. Previously proposed pricing functions satisfy some of these criteria (i.e. they are defined for restricted subclasses of queries and/or use relaxed conditions for avoiding arbitrage). In this paper, we study arbitrage-free pricing functions defined for arbitrary queries. 
We propose new necessary conditions for avoiding arbitrage and provide new arbitrage-free pricing functions. We also prove several negative results related to the tension between flexible pricing and avoiding arbitrage, and show how this tension often results in unreasonable prices.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:SMF, author = "Chao Zhang and Jiawei Han and Lidan Shou and Jiajun Lu and Thomas {La Porta}", title = "{Splitter}: mining fine-grained sequential patterns in semantic trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "769--780", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Driven by the advance of positioning technology and the popularity of location-sharing services, semantic-enriched trajectory data have become unprecedentedly available. The sequential patterns hidden in such data, when properly defined and extracted, can greatly benefit tasks like targeted advertising and urban planning. Unfortunately, classic sequential pattern mining algorithms developed for transactional data cannot effectively mine patterns in semantic trajectories, mainly because the places in the continuous space cannot be regarded as independent ``items''. Instead, similar places need to be grouped to collaboratively form frequent sequential patterns. That said, it remains a challenging task to mine what we call fine-grained sequential patterns, which must satisfy spatial compactness, semantic consistency and temporal continuity simultaneously. We propose Splitter to effectively mine such fine-grained sequential patterns in two steps. 
In the first step, it retrieves a set of spatially coarse patterns, each attached with a set of trajectory snippets that precisely record the pattern's occurrences in the database. In the second step, Splitter breaks each coarse pattern into fine-grained ones in a top-down manner, by progressively detecting dense and compact clusters in a higher-dimensional space spanned by the snippets. Splitter uses an effective algorithm called weighted snippet shift to detect such clusters, and leverages a divide-and-conquer strategy to speed up the top-down pattern splitting process. Our experiments on both real and synthetic data sets demonstrate the effectiveness and efficiency of Splitter.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Floratou:2014:TBW, author = "Avrilia Floratou and Frank Bertsch and Jignesh M. Patel and Georgios Laskaris", title = "Towards building wind tunnels for data center design", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "9", pages = "781--784", month = may, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:18 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data center design is a tedious and expensive process. Recently, this process has become even more challenging as users of cloud services expect to have guaranteed levels of availability, durability and performance. A new challenge for the service providers is to find the most cost-effective data center design and configuration that will accommodate the users' expectations, on ever-changing workloads, and constantly evolving hardware and software components. In this paper, we argue that data center design should become a systematic process. 
First, it should be done using an integrated approach that takes into account both the hardware and the software interdependencies, and their impact on users' expectations. Second, it should be performed in a ``wind tunnel'', which uses large-scale simulation to systematically explore the impact of a data center configuration on both the users' and the service providers' requirements. We believe that this is the first step towards systematic data center design --- an exciting area for future research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:RRQ, author = "Zhao Zhang and Cheqing Jin and Qiangqiang Kang", title = "Reverse $k$-ranks query", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "785--796", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding matching customers for a given product based on individual user's preference is critical for many applications, especially in e-commerce. Recently, the reverse top-$k$ query is proposed to return a number of customers who regard a given product as one of the $k$ most favorite products based on a linear model. Although a few ``hot'' products can be returned to some customers via reverse top-$k$ query, a large proportion of products (over 90\%, as our example illustrates, see Figure 2) cannot find any matching customers. Inspired by this observation, we propose a new kind of query ($R$-$k$ Ranks) which finds for a given product, the top-$k$ customers whose rank for the product is highest among all customers, to ensure 100\% coverage for any given product, no matter it is hot or niche. 
Not limited to e-commerce, the concept of customer --- product can be extended to a wider range of applications, such as dating and job-hunting. Unfortunately, existing approaches for reverse top-$k$ query cannot be used to handle $R$-$k$ Ranks conveniently due to infeasibility of getting enough elements for the query result. Hence, we propose three novel approaches to efficiently process $R$-$k$ Ranks query, including one tree-based method and two batch-pruning-based methods. Analysis of theoretical and experimental results on real and synthetic data sets illustrates the efficacy of the proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jugel:2014:MVO, author = "Uwe Jugel and Zbigniew Jerzak and Gregor Hackenbroich and Volker Markl", title = "{M4}: a visualization-oriented time series data aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "797--808", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visual analysis of high-volume time series data is ubiquitous in many industries, including finance, banking, and discrete manufacturing. Contemporary, RDBMS-based systems for visualization of high-volume time series data have difficulty to cope with the hard latency requirements and high ingestion rates of interactive visualizations. Existing solutions for lowering the volume of time series data disregard the semantics of visualizations and result in visualization errors. In this work, we introduce M4, an aggregation-based time series dimensionality reduction technique that provides error-free visualizations at high data reduction rates. 
Focusing on line charts, as the predominant form of time series visualization, we explain in detail the drawbacks of existing data reduction techniques and how our approach outperforms state of the art, by respecting the process of line rasterization. We describe how to incorporate aggregation-based dimensionality reduction at the query level in a visualization-driven query rewriting system. Our approach is generic and applicable to any visualization system that uses an RDBMS as data source. Using real world data sets from high tech manufacturing, stock markets, and sports analytics domains we demonstrate that our visualization-oriented data aggregation can reduce data volumes by up to two orders of magnitude, while preserving perfect visualizations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ghashami:2014:CMA, author = "Mina Ghashami and Jeff M. Phillips and Feifei Li", title = "Continuous matrix approximation on distributed data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "809--820", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tracking and approximating data matrices in streaming fashion is a fundamental challenge. The problem requires more care and attention when data comes from multiple distributed sites, each receiving a stream of data. This paper considers the problem of ``tracking approximations to a matrix'' in the distributed streaming model. 
In this model, there are $m$ distributed sites each observing a distinct stream of data (where each element is a row of a distributed matrix) and has a communication channel with a coordinator, and the goal is to track an $ \epsilon $-approximation to the norm of the matrix along any direction. To that end, we present novel algorithms to address the matrix approximation problem. Our algorithms maintain a smaller matrix $B$, as an approximation to a distributed streaming matrix $A$, such that for any unit vector $x$: $ | \, || A x ||^2 - || B x ||^2 | \leq \epsilon || A ||^2_F$. Our algorithms work in streaming fashion and incur small communication, which is critical for distributed computation. Our best method is deterministic and uses only $ O((m / \epsilon) \log (\beta N))$ communication, where $N$ is the size of stream (at the time of the query) and $ \beta $ is an upperbound on the squared norm of any row of the matrix. In addition to proving all algorithmic properties theoretically, extensive experiments with real large datasets demonstrate the efficiency of these protocols.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2014:EAD, author = "Kun Ren and Alexander Thomson and Daniel J. Abadi", title = "An evaluation of the advantages and disadvantages of deterministic database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "821--832", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent proposals for deterministic database system designs argue that deterministic database systems facilitate replication since the same input can be independently sent to two different replicas without concern for replica divergence. 
In addition, they argue that determinism yields performance benefits due to (1) the introduction of deadlock avoidance techniques, (2) the reduction (or elimination) of distributed commit protocols, and (3) light-weight locking. However, these performance benefits are not universally applicable, and there exist several disadvantages of determinism, including (1) the additional overhead of processing transactions for which it is not known in advance what data will be accessed, (2) an inability to abort transactions arbitrarily (e.g., in the case of database or partition overload), and (3) the increased latency required by a preprocessing layer that ensures that the same input is sent to every replica. This paper presents a thorough experimental study that carefully investigates both the advantages and disadvantages of determinism, in order to give a database user a more complete understanding of which database to use for a given database workload and cluster configuration.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:EMD, author = "Hao Zhang and Bogdan Marius Tudor and Gang Chen and Beng Chin Ooi", title = "Efficient in-memory data management: an analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "833--836", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper analyzes the performance of three systems for in-memory data management: Memcached, Redis and the Resilient Distributed Datasets (RDD) implemented by Spark. 
By performing a thorough performance analysis of both analytics operations and fine-grained object operations such as set/get, we show that neither system handles efficiently both types of workloads. For Memcached and Redis the CPU and I/O performance of the TCP stack are the bottlenecks --- even when serving in-memory objects within a single server node. RDD does not support efficient get operation for random objects, due to a large startup cost of the get job. Our analysis reveals a set of features that a system must support in order to achieve efficient in-memory data management.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aluc:2014:WMW, author = "G{\"u}nes Alu{\c{c}} and M. Tamer {\"O}zsu and Khuzaima Daudjee", title = "Workload matters: why {RDF} databases need a new design", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "837--840", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Resource Description Framework (RDF) is a standard for conceptually describing data on the Web, and SPARQL is the query language for RDF. As RDF is becoming widely utilized, RDF data management systems are being exposed to more diverse and dynamic workloads. Existing systems are workload-oblivious, and are therefore unable to provide consistently good performance. We propose a vision for a workload-aware and adaptive system. To realize this vision, we re-evaluate relevant existing physical design criteria for RDF and address the resulting set of new challenges.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alsubaiee:2014:SMA, author = "Sattam Alsubaiee and Alexander Behm and Vinayak Borkar and Zachary Heilbron and Young-Seok Kim and Michael J. Carey and Markus Dreseler and Chen Li", title = "Storage management in {AsterixDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "841--852", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social networks, online communities, mobile devices, and instant messaging applications generate complex, unstructured data at a high rate, resulting in large volumes of data. This poses new challenges for data management systems that aim to ingest, store, index, and analyze such data efficiently. In response, we released the first public version of AsterixDB, an open-source Big Data Management System (BDMS), in June of 2013. This paper describes the storage management layer of AsterixDB, providing a detailed description of its ingestion-oriented approach to local storage and a set of initial measurements of its ingestion-related performance characteristics. In order to support high frequency insertions, AsterixDB has wholly adopted Log-Structured Merge-trees as the storage technology for all of its index structures. We describe how the AsterixDB software framework enables ``LSM-ification'' (conversion from an in-place update, disk-based data structure to a deferred-update, append-only data structure) of any kind of index structure that supports certain primitive operations, enabling the index to ingest data efficiently. We also describe how AsterixDB ensures the ACID properties for operations involving multiple heterogeneous LSM-based indexes. 
Lastly, we highlight the challenges related to managing the resources of a system when many LSM indexes are used concurrently and present AsterixDB's initial solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Klonatos:2014:BEQ, author = "Yannis Klonatos and Christoph Koch and Tiark Rompf and Hassan Chafi", title = "Building efficient query engines in a high-level language", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "853--864", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See errata \cite{Klonatos:2014:EBE}.", abstract = "In this paper we advocate that it is time for a radical rethinking of database systems design. Developers should be able to leverage high-level programming languages without having to pay a price in efficiency. To realize our vision of abstraction without regret, we present LegoBase, a query engine written in the high-level programming language Scala. The key technique to regain efficiency is to apply generative programming: the Scala code that constitutes the query engine, despite its high-level appearance, is actually a program generator that emits specialized, low-level C code. We show how the combination of high-level and generative programming allows to easily implement a wide spectrum of optimizations that are difficult to achieve with existing low-level query compilers, and how it can continuously optimize the query engine. 
We evaluate our approach with the TPC-H benchmark and show that: (a) with all optimizations enabled, our architecture significantly outperforms a commercial in-memory database system as well as an existing query compiler, (b) these performance improvements require programming just a few hundred lines of high-level code instead of complicated low-level code that is required by existing query compilers and, finally, that (c) the compilation overhead is low compared to the overall execution time, thus making our approach usable in practice for efficiently compiling query engines.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:SLT, author = "Tianzheng Wang and Ryan Johnson", title = "Scalable logging through emerging non-volatile memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "865--876", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Emerging byte-addressable, non-volatile memory (NVM) is fundamentally changing the design principle of transaction logging. It potentially invalidates the need for flush-before-commit as log records are persistent immediately upon write. Distributed logging---a once prohibitive technique for single node systems in the DRAM era---becomes a promising solution to easing the logging bottleneck because of the non-volatility and high performance of NVM. In this paper, we advocate NVM and distributed logging on multicore and multi-socket hardware. We identify the challenges brought by distributed logging and discuss solutions. To protect committed work in NVM-based systems, we propose passive group commit, a lightweight, practical approach that leverages existing hardware and group commit. 
We expect that durable processor cache is the ultimate solution to protecting committed work and building reliable, scalable NVM-based systems in general. We evaluate distributed logging with logging-intensive workloads and show that distributed logging can achieve as much as $ \approx 3 \times $ speedup over centralized logging in a modern DBMS and that passive group commit only induces minuscule overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2014:WDM, author = "Bingsheng He", title = "When data management systems meet approximate hardware: challenges and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "877--880", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, approximate hardware designs have got many research interests in the computer architecture community. The essential idea of approximate hardware is that the hardware components such as CPU, memory and storage can trade off the accuracy of results for increased performance, reduced energy consumption, or both. We propose a DBMS ApproxiDB with its design, implementation and optimization aware of the underlying approximate hardware. ApproxiDB will run on a hybrid machine consisting of both approximate hardware and precise hardware (i.e., the conventional hardware without sacrificing the accuracy). With approximate hardware, ApproxiDB can efficiently support the concept of approximate query processing, without the overhead of pre-computed synopses or sampling techniques. More importantly, ApproxiDB is also beneficial to precise query processing, by developing non-trivial hybrid execution mechanisms on both precise and approximate hardware. 
In this vision paper, we sketch the initial design of ApproxiDB, discuss the technical challenges in building this system and outline an agenda for future research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2014:DFK, author = "Xin Luna Dong and Evgeniy Gabrilovich and Geremy Heitz and Wilko Horn and Kevin Murphy and Shaohua Sun and Wei Zhang", title = "From data fusion to knowledge fusion", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "881--892", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The task of data fusion is to identify the true values of data items (e.g., the true date of birth for Tom Cruise) among multiple observed values drawn from different sources (e.g., Web sites) of varying (and unknown) reliability. A recent survey [20] has provided a detailed comparison of various fusion methods on Deep Web data. In this paper, we study the applicability and limitations of different fusion techniques on a more challenging problem: knowledge fusion. Knowledge fusion identifies true subject-predicate-object triples extracted by multiple information extractors from multiple information sources. These extractors perform the tasks of entity linkage and schema alignment, thus introducing an additional source of noise that is quite different from that traditionally considered in the data fusion literature, which only focuses on factual errors in the original sources. 
We adapt state-of-the-art data fusion techniques and apply them to a knowledge base with 1.6B unique knowledge triples extracted by 12 extractors from over 1B Web pages, which is three orders of magnitude larger than the data sets used in previous data fusion papers. We show great promise of the data fusion approaches in solving the knowledge fusion problem, and suggest interesting research directions through a detailed error analysis of the methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Funke:2014:KPC, author = "Stefan Funke and Andr{\'e} Nusser and Sabine Storandt", title = "On $k$-path covers and their applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "893--902", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For a directed graph $G$ with vertex set $V$ we call a subset $ C \subseteq V $ a $k$-(All-)Path Cover if $C$ contains a node from any path consisting of $k$ nodes. This paper considers the problem of constructing small $k$-Path Covers in the context of road networks with millions of nodes and edges. In many application scenarios the set $C$ and its induced overlay graph constitute a very compact synopsis of $G$ which is the basis for the currently fastest data structure for personalized shortest path queries, visually pleasing overlays of subsampled paths, and efficient reporting, retrieval and aggregation of associated data in spatial network databases. 
Apart from a theoretical investigation of the problem, we provide efficient algorithms that produce very small $k$-Path Covers for large real-world road networks (with a posteriori guarantees via instance-based lower bounds).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:CDV, author = "Eugene Wu and Leilani Battle and Samuel R. Madden", title = "The case for data visualization management systems: vision paper", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "903--906", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most visualizations today are produced by retrieving data from a database and using a specialized visualization tool to render it. This decoupled approach results in significant duplication of functionality, such as aggregation and filters, and misses tremendous opportunities for cross-layer optimizations. In this paper, we present the case for an integrated Data Visualization Management System (DVMS) based on a declarative visualization language that fully compiles the end-to-end visualization pipeline into a set of relational algebra queries. Thus the DVMS can be both expressive via the visualization language, and performant by leveraging traditional and visualization-specific optimizations to scale interactive visualizations to massive datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:WAA, author = "Yinan Li and Jignesh M. 
Patel", title = "{WideTable}: an accelerator for analytical data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "907--918", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a technique called WideTable that aims to improve the speed of analytical data processing systems. A WideTable is built by denormalizing the database, and then converting complex queries into simple scans on the underlying (wide) table. To avoid the pitfalls associated with denormalization, e.g. space overheads, WideTable uses a combination of techniques including dictionary encoding and columnar storage. When denormalizing the data, WideTable uses outer joins to ensure that queries on tables in the schema graph, which are now nested as embedded tables in the WideTable, are processed correctly. Then, using a packed code scan technique, even complex queries on the original database can be answered by using simple scans on the WideTable(s). We experimentally evaluate our methods in a main memory setting using the queries in TPC-H, and demonstrate the effectiveness of our methods, both in terms of raw query performance and scalability when running on many-core machines.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{To:2014:FPW, author = "Hien To and Gabriel Ghinita and Cyrus Shahabi", title = "A framework for protecting worker location privacy in spatial crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "10", pages = "919--930", month = jun, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:21 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spatial Crowdsourcing (SC) is a transformative platform that engages individuals, groups and communities in the act of collecting, analyzing, and disseminating environmental, social and other spatio-temporal information. The objective of SC is to outsource a set of spatio-temporal tasks to a set of workers, i.e., individuals with mobile devices that perform the tasks by physically traveling to specified locations of interest. However, current solutions require the workers, who in many cases are simply volunteering for a cause, to disclose their locations to untrustworthy entities. In this paper, we introduce a framework for protecting location privacy of workers participating in SC tasks. We argue that existing location privacy techniques are not sufficient for SC, and we propose a mechanism based on differential privacy and geocasting that achieves effective SC services while offering privacy guarantees to workers. We investigate analytical models and task assignment strategies that balance multiple crucial aspects of SC functionality, such as task completion rate, worker travel distance and system overhead. Extensive experimental results on real-world datasets show that the proposed technique protects workers' location privacy without incurring significant performance metrics penalties.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eldawy:2014:TTS, author = "Ahmed Eldawy and Justin Levandoski and Per-{\AA}ke Larson", title = "Trekking through {Siberia}: managing cold data in a memory-optimized database", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "931--942", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Main memories are becoming sufficiently large that most OLTP databases can be stored entirely in main memory, but this may not be the best solution. OLTP workloads typically exhibit skewed access patterns where some records are hot (frequently accessed) but many records are cold (infrequently or never accessed). It is still more economical to store the coldest records on secondary storage such as flash. This paper introduces Siberia, a framework for managing cold data in the Microsoft Hekaton main-memory database engine. We discuss how to migrate cold data to secondary storage while providing an interface to the user to manipulate both hot and cold data that hides the actual data location. We describe how queries of different isolation levels can read and modify data stored in both hot and cold stores without restriction while minimizing number of accesses to cold storage. We also show how records can be migrated between hot and cold stores while the DBMS is online and active. Experiments reveal that for cold data access rates appropriate for main-memory optimized databases, we incur an acceptable 7-14\% throughput loss.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Duggan:2014:CPD, author = "Jennie Duggan", title = "The case for personal data-driven decision making", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "943--946", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-driven decision making (D3M) has shown great promise in professional pursuits such as business and government. Here, policymakers collect and analyze data to make their operations more efficient and equitable. Progress in bringing the benefits of D3M to everyday life has been slow. For example, a student asks, ``If I pursue an undergraduate degree at this university, what are my expected lifetime earnings?''. Presently there is no principled way to search for this, because an accurate answer depends on the student and school. Such queries are personalized, winnowing down large datasets for specific circumstances, rather than applying well-defined predicates. They predict decision outcomes by extrapolating from relevant examples. This vision paper introduces a new approach to D3M that is designed to empower the individual to make informed choices. Here, we highlight research opportunities for the data management community arising from this proposal.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chairunnanda:2014:CMM, author = "Prima Chairunnanda and Khuzaima Daudjee and M. 
Tamer {\"O}zsu", title = "{ConfluxDB}: multi-master replication for partitioned snapshot isolation databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "947--958", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Lazy replication with snapshot isolation (SI) has emerged as a popular choice for distributed databases. However, lazy replication often requires execution of update transactions at one (master) site so that it is relatively easy for a total SI order to be determined for consistent installation of updates in the lazily replicated system. We propose a set of techniques that support update transaction execution over multiple partitioned sites, thereby allowing the master to scale. Our techniques determine a total SI order for update transactions over multiple master sites without requiring global coordination in the distributed system, and ensure that updates are installed in this order at all sites to provide consistent and scalable replication with SI. We present ConfluxDB, a PostgreSQL-based implementation of our techniques, and demonstrate its effectiveness through experimental evaluation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goncalves:2014:DMS, author = "Bernardo Gon{\c{c}}alves and Fabio Porto", title = "{$ \gamma $-DB}: managing scientific hypotheses as uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "959--962", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In view of the paradigm shift that makes science ever more data-driven, we consider deterministic scientific hypotheses as uncertain data. This vision comprises a probabilistic database (p-DB) design methodology for the systematic construction and management of U-relational hypothesis DBs, viz., $ \gamma $-DBs. It introduces hypothesis management as a promising new class of applications for p-DBs. We illustrate the potential of $ \gamma $-DB as a tool for deep predictive analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Woods:2014:IIS, author = "Louis Woods and Zsolt Istv{\'a}n and Gustavo Alonso", title = "{Ibex}: an intelligent storage engine with support for advanced {SQL} offloading", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "963--974", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data appliances face severe bandwidth bottlenecks when moving vast amounts of data from storage to the query processing nodes. 
A possible solution to mitigate these bottlenecks is query off-loading to an intelligent storage engine, where partial or whole queries are pushed down to the storage engine. In this paper, we present Ibex, a prototype of an intelligent storage engine that supports off-loading of complex query operators. Besides increasing performance, Ibex also reduces energy consumption, as it uses an FPGA rather than conventional CPUs to implement the off-load engine. Ibex is a hybrid engine, with dedicated hardware that evaluates SQL expressions at line-rate and a software fallback for tasks that the hardware engine cannot handle. Ibex supports GROUP BY aggregation, as well as projection- and selection-based filtering. GROUP BY aggregation has a higher impact on performance but is also a more challenging operator to implement on an FPGA.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yun:2014:NNL, author = "Hyokun Yun and Hsiang-Fu Yu and Cho-Jui Hsieh and S. V. N. Vishwanathan and Inderjit Dhillon", title = "{NOMAD}: non-locking, stochastic multi-machine algorithm for asynchronous and decentralized matrix completion", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "975--986", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We develop an efficient parallel distributed algorithm for matrix completion, named NOMAD (Non-locking, stOchastic Multi-machine algorithm for Asynchronous and Decentralized matrix completion). NOMAD is a decentralized algorithm with non-blocking communication between processors. 
One of the key features of NOMAD is that the ownership of a variable is asynchronously transferred between processors in a decentralized fashion. As a consequence it is a lock-free parallel algorithm. In spite of being asynchronous, the variable updates of NOMAD are serializable, that is, there is an equivalent update ordering in a serial implementation. NOMAD outperforms synchronous algorithms which require explicit bulk synchronization after every iteration: our extensive empirical evaluation shows that not only does our algorithm perform well in distributed setting on commodity hardware, but also outperforms state-of-the-art algorithms on a HPC cluster both in multi-core and distributed memory settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2014:RVL, author = "Shaoxu Song and Hong Cheng and Jeffrey Xu Yu and Lei Chen", title = "Repairing vertex labels under neighborhood constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "987--998", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A broad class of data, ranging from similarity networks, workflow networks to protein networks, can be modeled as graphs with data values as vertex labels. The vertex labels (data values) are often dirty for various reasons such as typos or erroneous reporting of results in scientific experiments. Neighborhood constraints, specifying label pairs that are allowed to appear on adjacent vertexes in the graph, are employed to detect and repair erroneous vertex labels. In this paper, we study the problem of repairing vertex labels to make graphs satisfy neighborhood constraints. 
Unfortunately, the relabeling problem is proved to be NP hard, which motivates us to devise approximation methods for repairing, and identify interesting special cases (star and clique constraints) that can be efficiently solved. We propose several approximate repairing algorithms including greedy heuristics, contraction method and a hybrid approach. The performances of algorithms are also analyzed for the special case. Our extensive experimental evaluation, on both synthetic and real data, demonstrates the effectiveness of eliminating frauds in several types of application networks. Remarkably, the hybrid method performs well in practice, i.e., guarantees termination, while achieving high effectiveness at the same time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Altowim:2014:PAR, author = "Yasser Altowim and Dmitri V. Kalashnikov and Sharad Mehrotra", title = "Progressive approach to relational entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "999--1010", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes a progressive approach to entity resolution (ER) that allows users to explore a trade-off between the resolution cost and the achieved quality of the resolved data. In particular, our approach aims to produce the highest quality result given a constraint on the resolution budget, specified by the user. Our proposed method monitors and dynamically reassesses the resolution progress to determine which parts of the data should be resolved next and how they should be resolved. 
The comprehensive empirical evaluation of the proposed approach demonstrates its significant advantage in terms of efficiency over the traditional ER techniques for the given problem settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:CAQ, author = "Kaibo Wang and Kai Zhang and Yuan Yuan and Siyuan Ma and Rubao Lee and Xiaoning Ding and Xiaodong Zhang", title = "Concurrent analytical query processing with {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "11", pages = "1011--1022", month = jul, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In current databases, GPUs are used as dedicated accelerators to process each individual query. Sharing GPUs among concurrent queries is not supported, causing serious resource underutilization. Based on the profiling of an open-source GPU query engine running commonly used single-query data warehousing workloads, we observe that the utilization of main GPU resources is only up to 25\%. The underutilization leads to low system throughput. To address the problem, this paper proposes concurrent query execution as an effective solution. To efficiently share GPUs among concurrent queries for high throughput, the major challenge is to provide software support to control and resolve resource contention incurred by the sharing. Our solution relies on GPU query scheduling and device memory swapping policies to address this challenge. We have implemented a prototype system and evaluated it intensively. The experiment results confirm the effectiveness and performance advantage of our approach. 
By executing multiple GPU queries concurrently, system throughput can be improved by up to 55\% compared with dedicated processing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Maehara:2014:CPP, author = "Takanori Maehara and Takuya Akiba and Yoichi Iwata and Ken-ichi Kawarabayashi", title = "Computing personalized {PageRank} quickly by exploiting graph structures", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1023--1034", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a new scalable algorithm that can compute Personalized PageRank (PPR) very quickly. The Power method is a state-of-the-art algorithm for computing exact PPR; however, it requires many iterations. Thus reducing the number of iterations is the main challenge. We achieve this by exploiting graph structures of web graphs and social networks. The convergence of our algorithm is very fast. In fact, it requires up to 7.5 times fewer iterations than the Power method and is up to five times faster in actual computation time. To the best of our knowledge, this is the first time to use graph structures explicitly to solve PPR quickly. Our contributions can be summarized as follows. 1. We provide an algorithm for computing a tree decomposition, which is more efficient and scalable than any previous algorithm. 2. Using the above algorithm, we can obtain a core-tree decomposition of any web graph and social network. 
This allows us to decompose a web graph and a social network into (1) the core, which behaves like an expander graph, and (2) a small tree-width graph, which behaves like a tree in an algorithmic sense. 3. We apply a direct method to the small tree-width graph to construct an LU decomposition. 4. Building on the LU decomposition and using it as pre-conditioner, we apply GMRES method (a state-of-the-art advanced iterative method) to compute PPR for whole web graphs and social networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Serafini:2014:AES, author = "Marco Serafini and Essam Mansour and Ashraf Aboulnaga and Kenneth Salem and Taha Rafiq and Umar Farooq Minhas", title = "{Accordion}: elastic scalability for database systems supporting distributed transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1035--1046", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Providing the ability to elastically use more or fewer servers on demand (scale out and scale in) as the load varies is essential for database management systems (DBMSes) deployed on today's distributed computing platforms, such as the cloud. This requires solving the problem of dynamic (online) data placement, which has so far been addressed only for workloads where all transactions are local to one server. In DBMSes where ACID transactions can access more than one partition, distributed transactions represent a major performance bottleneck. 
Scaling out and spreading data across a larger number of servers does not necessarily result in a linear increase in the overall system throughput, because transactions that used to access only one server may become distributed. In this paper we present Accordion, a dynamic data placement system for partition-based DBMSes that support ACID transactions (local or distributed). It does so by explicitly considering the affinity between partitions, which indicates the frequency in which they are accessed together by the same transactions. Accordion estimates the capacity of a server by explicitly considering the impact of distributed transactions and affinity on the maximum throughput of the server. It then integrates this estimation in a mixed-integer linear program to explore the space of possible configurations and decide whether to scale out. We implemented Accordion and evaluated it using H-Store, a shared-nothing in-memory DBMS. Our results using the TPC-C and YCSB benchmarks show that Accordion achieves benefits compared to alternative heuristics of up to an order of magnitude reduction in the number of servers used and in the amount of data migrated.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Han:2014:ECP, author = "Minyang Han and Khuzaima Daudjee and Khaled Ammar and M. 
Tamer {\"O}zsu and Xingfang Wang and Tianqi Jin", title = "An experimental comparison of {Pregel}-like graph processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1047--1058", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The introduction of Google's Pregel generated much interest in the field of large-scale graph data processing, inspiring the development of Pregel-like systems such as Apache Giraph, GPS, Mizan, and GraphLab, all of which have appeared in the past two years. To gain an understanding of how Pregel-like systems perform, we conduct a study to experimentally compare Giraph, GPS, Mizan, and GraphLab on equal ground by considering graph and algorithm agnostic optimizations and by using several metrics. The systems are compared with four different algorithms (PageRank, single source shortest path, weakly connected components, and distributed minimum spanning tree) on up to 128 Amazon EC2 machines. We find that the system optimizations present in Giraph and GraphLab allow them to perform well. Our evaluation also shows Giraph 1.0.0's considerable improvement since Giraph 0.1 and identifies areas of improvement for all systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sarma:2014:CSJ, author = "Akash {Das Sarma} and Yeye He and Surajit Chaudhuri", title = "{ClusterJoin}: a similarity joins framework using map-reduce", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1059--1070", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity join is the problem of finding pairs of records with similarity score greater than some threshold. In this paper we study the problem of scaling up similarity join for different metric distance functions using MapReduce. We propose a ClusterJoin framework that partitions the data space based on the underlying data distribution, and distributes each record to partitions in which they may produce join results based on the distance threshold. We design a set of strong candidate filters specific to different distance functions using a novel bisector-based framework, so that each record only needs to be distributed to a small number of partitions while still guaranteeing correctness. To address data skewness, which is common for high dimensional data, we further develop a dynamic load balancing scheme using sampling, which provides strong probabilistic guarantees on the size of partitions, and greatly improves scalability. Experimental evaluation using real data sets shows that our approach is considerably more scalable compared to state-of-the-art algorithms, especially for high dimensional data with low distance thresholds.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vesdapunt:2014:CAE, author = "Norases Vesdapunt and Kedar Bellare and Nilesh Dalvi", title = "Crowdsourcing algorithms for entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1071--1082", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we study a hybrid human-machine approach for solving the problem of Entity Resolution (ER). The goal of ER is to identify all records in a database that refer to the same underlying entity, and are therefore duplicates of each other. Our input is a graph over all the records in a database, where each edge has a probability denoting our prior belief (based on Machine Learning models) that the pair of records represented by the given edge are duplicates. Our objective is to resolve all the duplicates by asking humans to verify the equality of a subset of edges, leveraging the transitivity of the equality relation to infer the remaining edges (e.g. $ a = c $ can be inferred given $ a = b $ and $ b = c$). We consider the problem of designing optimal strategies for asking questions to humans that minimize the expected number of questions asked. Using our theoretical framework, we analyze several strategies, and show that a strategy, claimed as ``optimal'' for this problem in a recent work, can perform arbitrarily bad in theory. We propose alternate strategies with theoretical guarantees. Using both public datasets as well as the production system at Facebook, we show that our techniques are effective in practice.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2014:DGS, author = "Wenfei Fan and Xin Wang and Yinghui Wu and Dong Deng", title = "Distributed graph simulation: impossibility and possibility", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1083--1094", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies fundamental problems for distributed graph simulation. Given a pattern query Q and a graph G that is fragmented and distributed, a graph simulation algorithm A is to compute the matches Q (G) of Q in G. We say that A is parallel scalable in (a) response time if its parallel computational cost is determined by the largest fragment F$_m$ of G and the size | Q | of query Q, and (b) data shipment if its total amount of data shipped is determined by | Q | and the number of fragments of G, independent of the size of graph G. (1) We prove an impossibility theorem: there exists no distributed graph simulation algorithm that is parallel scalable in either response time or data shipment. (2) However, we show that distributed graph simulation is partition bounded, i.e., its response time depends only on | Q |, | F$_m$ | and the number | V$_f$ | of nodes in G with edges across different fragments; and its data shipment depends on | Q | and the number | E$_f$ | of crossing edges only. We provide the first algorithms with these performance guarantees. (3) We also identify special cases of patterns and graphs when parallel scalability is possible. (4) We experimentally verify the scalability and efficiency of our algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nagel:2014:CGE, author = "Fabian Nagel and Gavin Bierman and Stratis D. Viglas", title = "Code generation for efficient query processing in managed runtimes", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1095--1106", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we examine opportunities arising from the convergence of two trends in data management: in-memory database systems (imdbs), which have received renewed attention following the availability of affordable, very large main memory systems; and language-integrated query, which transparently integrates database queries with programming languages (thus addressing the famous 'impedance mismatch' problem). Language-integrated query not only gives application developers a more convenient way to query external data sources like imdbs, but also to use the same querying language to query an application's in-memory collections. The latter offers further transparency to developers as the query language and all data is represented in the data model of the host programming language. However, compared to imdbs, this additional freedom comes at a higher cost for query evaluation. Our vision is to improve in-memory query processing of application objects by introducing database technologies to managed runtimes. We focus on querying and we leverage query compilation to improve query processing on application objects. We explore different query compilation strategies and study how they improve the performance of query processing over application data. We take C\# as the host programming language as it supports language-integrated query through the linq framework. 
Our techniques deliver significant performance improvements over the default linq implementation. Our work makes important first steps towards a future where data processing applications will commonly run on machines that can store their entire datasets in-memory, and will be written in a single programming language employing language-integrated query and imdb-inspired runtimes to provide transparent and highly efficient querying.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2014:AED, author = "Weimo Liu and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Aggregate estimation over dynamic hidden web databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1107--1118", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many databases on the web are ``hidden'' behind (i.e., accessible only through) their restrictive, form-like, search interfaces. Recent studies have shown that it is possible to estimate aggregate query answers over such hidden web databases by issuing a small number of carefully designed search queries through the restrictive web interface. A problem with these existing work, however, is that they all assume the underlying database to be static, while most real-world web databases (e.g., Amazon, eBay) are frequently updated. In this paper, we study the novel problem of estimating/tracking aggregates over dynamic hidden web databases while adhering to the stringent query-cost limitation they enforce (e.g., at most 1,000 search queries per day). 
Theoretical analysis and extensive real-world experiments demonstrate the effectiveness of our proposed algorithms and their superiority over baseline solutions (e.g., the repeated execution of algorithms designed for static web databases).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karpathiotakis:2014:AQP, author = "Manos Karpathiotakis and Miguel Branco and Ioannis Alagiannis and Anastasia Ailamaki", title = "Adaptive query processing on {RAW} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1119--1130", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database systems deliver impressive performance for large classes of workloads as the result of decades of research into optimizing database engines. High performance, however, is achieved at the cost of versatility. In particular, database systems only operate efficiently over loaded data, i.e., data converted from its original raw format into the system's internal data format. At the same time, data volume continues to increase exponentially and data varies increasingly, with an escalating number of new formats. The consequence is a growing impedance mismatch between the original structures holding the data in the raw files and the structures used by query engines for efficient processing. In an ideal scenario, the query engine would seamlessly adapt itself to the data and ensure efficient query processing regardless of the input data formats, optimizing itself to each instance of a file and of a query by leveraging information available at query time. Today's systems, however, force data to adapt to the query engine during data loading. 
This paper proposes adapting the query engine to the formats of raw data. It presents RAW, a prototype query engine which enables querying heterogeneous data sources transparently. RAW employs Just-In-Time access paths, which efficiently couple heterogeneous raw files to the query engine and reduce the overheads of traditional general-purpose scan operators. There are, however, inherent overheads with accessing raw data directly that cannot be eliminated, such as converting the raw values. Therefore, RAW also uses column shreds, ensuring that we pay these costs only for the subsets of raw data strictly needed by a query. We use RAW in a real-world scenario and achieve a two-order of magnitude speedup against the existing hand-written solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Afrati:2014:SQT, author = "Foto N. Afrati and Dan Delorey and Mosha Pasumansky and Jeffrey D. Ullman", title = "Storing and querying tree-structured records in {Dremel}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1131--1142", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In Dremel, data is stored as nested relations. The schema for a relation is a tree, all of whose nodes are attributes, and whose leaf attributes hold values. We explore filter and aggregate queries that are given in the Dremel dialect of SQL. Complications arise because of repeated attributes, i.e., attributes that are allowed to have more than one value. 
We focus on the common class of Dremel queries that are processed on column-stored data in a way that results in query processing time that is linear on the size of the relevant data, i.e., data in the columns that participate in the query. We formally define the data model, the query language and the algorithms for query processing in column-stored data. The concepts of repetition context and semi-flattening are introduced here and play a central role in understanding this class of queries and their algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Starlinger:2014:SSS, author = "Johannes Starlinger and Bryan Brancotte and Sarah Cohen-Boulakia and Ulf Leser", title = "Similarity search for scientific workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1143--1154", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increasing popularity of scientific workflows, public repositories are gaining importance as a means to share, find, and reuse such workflows. As the sizes of these repositories grow, methods to compare the scientific workflows stored in them become a necessity, for instance, to allow duplicate detection or similarity search. Scientific workflows are complex objects, and their comparison entails a number of distinct steps from comparing atomic elements to comparison of the workflows as a whole. Various studies have implemented methods for scientific workflow comparison and came up with often contradicting conclusions upon which algorithms work best. 
Comparing these results is cumbersome, as the original studies mixed different approaches for different steps and used different evaluation data and metrics. We contribute to the field (i) by dissecting each previous approach into an explicitly defined and comparable set of subtasks, (ii) by comparing in isolation different approaches taken at each step of scientific workflow comparison, reporting on a number of unexpected findings, (iii) by investigating how these can best be combined into aggregated measures, and (iv) by making available a gold standard of over 2000 similarity ratings contributed by 15 workflow experts on a corpus of almost 1500 workflows and re-implementations of all methods we evaluated.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kellaris:2014:DPE, author = "Georgios Kellaris and Stavros Papadopoulos and Xiaokui Xiao and Dimitris Papadias", title = "Differentially private event sequences over infinite streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1155--1166", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Numerous applications require continuous publication of statistics for monitoring purposes, such as real-time traffic analysis, timely disease outbreak discovery, and social trends observation. These statistics may be derived from sensitive user data and, hence, necessitate privacy preservation. A notable paradigm for offering strong privacy guarantees in statistics publishing is $\epsilon$-differential privacy. 
However, there is limited literature that adapts this concept to settings where the statistics are computed over an infinite stream of ``events'' (i.e., data items generated by the users), and published periodically. These works aim at hiding a single event over the entire stream. We argue that, in most practical scenarios, sensitive information is revealed from multiple events occurring at contiguous time instances. Towards this end, we put forth the novel notion of $w$-event privacy over infinite streams, which protects any event sequence occurring in $w$ successive time instants. We first formulate our privacy concept, motivate its importance, and introduce a methodology for achieving it. We next design two instantiations, whose utility is independent of the stream length. Finally, we confirm the practicality of our solutions experimenting with real data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Londhe:2014:MTC, author = "Nikhil Londhe and Vishrawas Gopalakrishnan and Aidong Zhang and Hung Q. Ngo and Rohini Srihari", title = "Matching titles with cross title web-search enrichment and community detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1167--1178", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Title matching refers roughly to the following problem. We are given two strings of text obtained from different data sources. The texts refer to some underlying physical entities and the problem is to report whether the two strings refer to the same physical entity or not. 
There are manifestations of this problem in a variety of domains, such as product or bibliography matching, and location or person disambiguation. We propose a new approach to solving this problem, consisting of two main components. The first component uses Web searches to ``enrich'' the given pair of titles: making titles that refer to the same physical entity more similar, and those which do not, much less similar. A notion of similarity is then measured using the second component, where the tokens from the two titles are modelled as vertices of a ``social'' network graph. A ``strength of ties'' style of clustering algorithm is then applied on this to see whether they form one cohesive ``community'' (matching titles), or separately clustered communities (mismatching titles). Experimental results confirm the effectiveness of our approach over existing title matching methods across several input domains.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2014:CSR, author = "Shaoxu Song and Lei Chen and Hong Cheng", title = "On concise set of relative candidate keys", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1179--1190", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Matching keys, specifying what attributes to compare and how to compare them for identifying the same real-world entities, are found to be useful in applications like record matching, blocking and windowing [7]. Owing to the complex redundant semantics among matching keys, capturing a proper set of matching keys is highly non-trivial. Analogous to minimal/candidate keys w.r.t. 
functional dependencies, relative candidate keys (RCKs [7], with a minimal number of compared attributes, see a more formal definition in Section 2) can clear up redundant semantics w.r.t. ``what attributes to compare''. However, we note that redundancy issues may still exist among RCKs on the same attributes about ``how to compare them''. In this paper, we propose to find a concise set of matching keys, which has less redundancy and can still meet the requirements on coverage and validity. Specifically, we study approximation algorithms to efficiently discover a near optimal set. To ensure the quality of matching keys, the returned results are guaranteed to be RCKs (minimal on compared attributes), and most importantly, minimal w.r.t. distance restrictions (i.e., redundancy free w.r.t. ``how to compare the attributes''). The experimental evaluation demonstrates that our concise RCK set is more effective than the existing RCK choosing method. Moreover, the proposed pruning methods show up to 2 orders of magnitude improvement w.r.t. time costs on concise RCK set discovery.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wei:2014:RQI, author = "Hao Wei and Jeffrey Xu Yu and Can Lu and Ruoming Jin", title = "Reachability querying: an independent permutation labeling approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1191--1202", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Reachability query is a fundamental graph operation which answers whether a vertex can reach another vertex over a large directed graph G with $n$ vertices and $m$ edges, and has been extensively studied. 
In the literature, all the approaches compute a label for every vertex in a graph G by index construction offline. The query time for answering reachability queries online is affected by the quality of the labels computed in index construction. The three main costs are the index construction time, the index size, and the query time. Some of the up-to-date approaches can answer reachability queries efficiently, but spend non-linear time to construct an index. Some of the up-to-date approaches construct an index in linear time and space, but may need to depth-first search G at run-time in $ O(n + m)$. In this paper, as the first, we propose a new randomized labeling approach to answer reachability queries, and the randomness is by independent permutation. We conduct extensive experimental studies to compare with the up-to-date approaches using 19 large real datasets used in the existing work and synthetic datasets. We confirm the efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2014:HDL, author = "Minhao Jiang and Ada Wai-Chee Fu and Raymond Chi-Wing Wong and Yanyan Xu", title = "Hop doubling label indexing for point-to-point distance querying on scale-free networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1203--1214", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of point-to-point distance querying for massive scale-free graphs, which is important for numerous applications. Given a directed or undirected graph, we propose to build an index for answering such queries based on a novel hop-doubling labeling technique. 
We derive bounds on the index size, the computation costs and I/O costs based on the properties of unweighted scale-free graphs. We show that our method is much more efficient and effective compared to the state-of-the-art techniques, in terms of both querying time and indexing costs. Our empirical study shows that our method can handle graphs that are orders of magnitude larger than existing methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Suchanek:2014:SC, author = "Fabian M. Suchanek and Nicoleta Preda", title = "Semantic culturomics", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1215--1218", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Newspapers are testimonials of history. The same is increasingly true of social media such as online forums, online communities, and blogs. By looking at the sequence of articles over time, one can discover the birth and the development of trends that marked society and history --- a field known as ``Culturomics''. But Culturomics has so far been limited to statistics on keywords. In this vision paper, we argue that the advent of large knowledge bases (such as YAGO [37], NELL [5], DBpedia [3], and Freebase) will revolutionize the field. If their knowledge is combined with the news articles, it can breathe life into what is otherwise just a sequence of words for a machine. This will allow discovering trends in history and culture, explaining them through explicit logical rules, and making predictions about the events of the future. 
We predict that this could open up a new field of research, ``Semantic Culturomics'', in which no longer human text helps machines build up knowledge bases, but knowledge bases help humans understand their society.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kuhlenkamp:2014:BSE, author = "J{\"o}rn Kuhlenkamp and Markus Klems and Oliver R{\"o}ss", title = "Benchmarking scalability and elasticity of distributed database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1219--1230", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed database system performance benchmarks are an important source of information for decision makers who must select the right technology for their data management problems. Since important decisions rely on trustworthy experimental data, it is necessary to reproduce experiments and verify the results. We reproduce performance and scalability benchmarking experiments of HBase and Cassandra that have been conducted by previous research and compare the results. The scope of our reproduced experiments is extended with a performance evaluation of Cassandra on different Amazon EC2 infrastructure configurations, and an evaluation of Cassandra and HBase elasticity by measuring scaling speed and performance impact while scaling.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2014:BCQ, author = "Yang Cao and Wenfei Fan and Tianyu Wo and Wenyuan Yu", title = "Bounded conjunctive queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1231--1242", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A query Q is said to be effectively bounded if for all datasets D, there exists a subset D$_Q$ of D such that Q (D) = Q (D$_Q$), and the size of D$_Q$ and time for fetching D$_Q$ are independent of the size of D. The need for studying such queries is evident, since it allows us to compute Q (D) by accessing a bounded dataset D$_Q$, regardless of how big D is. This paper investigates effectively bounded conjunctive queries (SPC) under an access schema A, which specifies indices and cardinality constraints commonly used. We provide characterizations (sufficient and necessary conditions) for determining whether an SPC query Q is effectively bounded under A. We study several problems for deciding whether Q is bounded, and if not, for identifying a minimum set of parameters of Q to instantiate and make Q bounded. We show that these problems range from quadratic-time to NP-complete, and develop efficient (heuristic) algorithms for them. We also provide an algorithm that, given an effectively bounded SPC query Q and an access schema A, generates a query plan for evaluating Q by accessing a bounded amount of data in any (possibly big) dataset. We experimentally verify that our algorithms substantially reduce the cost of query evaluation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shanbhag:2014:OJE, author = "Anil Shanbhag and S. Sudarshan", title = "Optimizing join enumeration in transformation-based query optimizers", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1243--1254", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers built on the Volcano/Cascades framework, which is based on transformation rules, are used in many commercial databases. Transformation rulesets proposed earlier for join order enumeration in such a framework either allow enumeration of joins with cross-products (which can significantly increase the cost of optimization), or generate a large number of duplicate derivations. In this paper we propose two new rulesets for generating cross-product free trees. One of the rulesets is a minor extension of a simple but inefficient ruleset, which we prove is complete (we also show that a naive extension of an efficient ruleset leads to incompleteness). We then propose an efficient new ruleset, which is based on techniques proposed recently for top-down join order enumeration, but unlike earlier work it is cleanly integrated into the Volcano/Cascades framework, and can be used in conjunction with other transformation rules. We show that our ruleset is complete (i.e., it generates the entire search space without cross products) while avoiding inefficiency due to duplicate derivations. We have implemented this ruleset in the PyroJ Optimizer (an implementation of the Volcano optimizer framework) and show that it significantly outperforms the alternatives, in some cases by up to two orders of magnitude, in terms of time taken.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jacob:2014:SMA, author = "Marie Jacob and Benny Kimelfeld and Julia Stoyanovich", title = "A system for management and analysis of preference data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1255--1258", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Preference data arises in a wide variety of domains. Over the past decade, we have seen a sharp increase in the volume of preference data, in the diversity of applications that use it, and in the richness of preference data analysis methods. Examples of applications include rank aggregation in genomic data analysis, management of votes in elections, and recommendation systems in e-commerce. However, little attention has been paid to the challenges of building a system for preference-data management, which would help incorporate sophisticated analytics into larger applications, support computational abstractions for usability by data scientists, and enable scaling up to modern volumes. This vision paper proposes a management system for preference data that aims to address these challenges. We adopt the relational database model, and propose extensions that are specialized to handling preference data. Specifically, we introduce a special type of a relation that is designed for preference data, and describe composable operators on preference relations that can be embedded in SQL statements, for convenient reuse across applications.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gupta:2014:MGR, author = "Ashish Gupta and Fan Yang and Jason Govig and Adam Kirsch and Kelvin Chan and Kevin Lai and Shuo Wu and Sandeep Govind Dhoot and Abhilash Rajesh Kumar and Ankur Agiwal and Sanjay Bhansali and Mingsheng Hong and Jamie Cameron and Masood Siddiqi and David Jones and Jeff Shute and Andrey Gubarev and Shivakumar Venkataraman and Divyakant Agrawal", title = "{Mesa}: geo-replicated, near real-time, scalable data warehousing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1259--1270", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Mesa is a highly scalable analytic data warehousing system that stores critical measurement data related to Google's Internet advertising business. Mesa is designed to satisfy a complex and challenging set of user and systems requirements, including near real-time data ingestion and queryability, as well as high availability, reliability, fault tolerance, and scalability for large data and query volumes. Specifically, Mesa handles petabytes of data, processes millions of row updates per second, and serves billions of queries that fetch trillions of rows per day. Mesa is geo-replicated across multiple datacenters and provides consistent and repeatable query answers at low latency, even when an entire datacenter fails. This paper presents the Mesa system and reports the performance and scale that it achieves.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liagouris:2014:EES, author = "John Liagouris and Nikos Mamoulis and Panagiotis Bouros and Manolis Terrovitis", title = "An effective encoding scheme for spatial {RDF} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1271--1282", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The RDF data model has recently been extended to support representation and querying of spatial information (i.e., locations and geometries), which is associated with RDF entities. Still, there are limited efforts towards extending RDF stores to efficiently support spatial queries, such as range selections (e.g., find entities within a given range) and spatial joins (e.g., find pairs of entities whose locations are close to each other). In this paper, we propose an extension for RDF stores that supports efficient spatial data management. Our contributions include an effective encoding scheme for entities having spatial locations, the introduction of on-the-fly spatial filters and spatial join algorithms, and several optimizations that minimize the overhead of geometry and dictionary accesses. We implemented the proposed techniques as an extension to the opensource RDF-3X engine and we experimentally evaluated them using real RDF knowledge bases. The results show that our system offers robust performance for spatial queries, while introducing little overhead to the original query engine.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:DSM, author = "Ce Zhang and Christopher R{\'e}", title = "{DimmWitted}: a study of main-memory statistical analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1283--1294", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We perform the first study of the tradeoff space of access methods and replication to support statistical analytics using first-order methods executed in the main memory of a Non-Uniform Memory Access (NUMA) machine. Statistical analytics systems differ from conventional SQL-analytics in the amount and types of memory incoherence that they can tolerate. Our goal is to understand tradeoffs in accessing the data in row- or column-order and at what granularity one should share the model and data for a statistical task. We study this new tradeoff space and discover that there are tradeoffs between hardware and statistical efficiency. We argue that our tradeoff study may provide valuable information for designers of analytics engines: for each system we consider, our prototype engine can run at least one popular task at least 100$ \times $ faster. We conduct our study across five architectures using popular models, including SVMs, logistic regression, Gibbs sampling, and neural networks.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Floratou:2014:SHF, author = "Avrilia Floratou and Umar Farooq Minhas and Fatma {\"O}zcan", title = "{SQL-on-Hadoop}: full circle back to shared-nothing database architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1295--1306", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SQL query processing for analytics over Hadoop data has recently gained significant traction. Among many systems providing some SQL support over Hadoop, Hive is the first native Hadoop system that uses an underlying framework such as MapReduce or Tez to process SQL-like statements. Impala, on the other hand, represents the new emerging class of SQL-on-Hadoop systems that exploit a shared-nothing parallel database architecture over Hadoop. Both systems optimize their data ingestion via columnar storage, and promote different file formats: ORC and Parquet. In this paper, we compare the performance of these two systems by conducting a set of cluster experiments using a TPC-H like benchmark and two TPC-DS inspired workloads. We also closely study the I/O efficiency of their columnar formats using a set of micro-benchmarks. Our results show that Impala is 3.3 X to 4.4 X faster than Hive on MapReduce and 2.1 X to 2.8 X than Hive on Tez for the overall TPC-H experiments. Impala is also 8.2 X to 10 X faster than Hive on MapReduce and about 4.3 X faster than Hive on Tez for the TPC-DS inspired experiments. Through detailed analysis of experimental results, we identify the reasons for this performance gap and examine the strengths and limitations of each system.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guarnieri:2014:OSA, author = "Marco Guarnieri and David Basin", title = "Optimal security-aware query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "12", pages = "1307--1318", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:26 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Security-Aware Query Processing is the problem of computing answers to queries in the presence of access control policies. We present general impossibility results for the existence of optimal algorithms for Security-Aware Query Processing and classify query languages for which such algorithms exist. In particular, we show that for the relational calculus there are no optimal algorithms, whereas optimal algorithms exist for some of its fragments, such as the existential fragment. We also establish relationships between two different models of Fine-Grained Access Control, called Truman and Non-Truman models, which have been previously presented in the literature as distinct. For optimal Security-Aware Query Processing, we show that the Non-Truman model is a special case of the Truman model for boolean queries in the relational calculus, moreover the two models coincide for more powerful languages, such as the relational calculus with aggregation operators. In contrast, these two models are distinct for non-boolean queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shi:2014:MTE, author = "Juwei Shi and Jia Zou and Jiaheng Lu and Zhao Cao and Shiqiang Li and Chen Wang", title = "{MRTuner}: a toolkit to enable holistic optimization for {MapReduce} jobs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1319--1330", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce based data-intensive computing solutions are increasingly deployed as production systems. Unlike Internet companies who invent and adopt the technology from the very beginning, traditional enterprises demand easy-to-use software due to the limited capabilities of administrators. Automatic job optimization software for MapReduce is a promising technique to satisfy such requirements. In this paper, we introduce a toolkit from IBM, called MRTuner, to enable holistic optimization for MapReduce jobs. In particular, we propose a novel Producer-Transporter-Consumer (PTC) model, which characterizes the tradeoffs in the parallel execution among tasks. We also carefully investigate the complicated relations among about twenty parameters, which have significant impact on the job performance. We design an efficient search algorithm to find the optimal execution plan. Finally, we conduct a thorough experimental evaluation on two different types of clusters using the HiBench suite which covers various Hadoop workloads from GB to TB size levels. The results show that the search latency of MRTuner is a few orders of magnitude faster than that of the state-of-the-art cost-based optimizer, and the effectiveness of the optimized execution plan is also significantly improved.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sadoghi:2014:RDL, author = "Mohammad Sadoghi and Mustafa Canim and Bishwaranjan Bhattacharjee and Fabian Nagel and Kenneth A. Ross", title = "Reducing database locking contention through multi-version concurrency", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1331--1342", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In multi-version databases, updates and deletions of records by transactions require appending a new record to tables rather than performing in-place updates. This mechanism incurs non-negligible performance overhead in the presence of multiple indexes on a table, where changes need to be propagated to all indexes. Additionally, an uncommitted record update will block other active transactions from using the index to fetch the most recently committed values for the updated record. In general, in order to support snapshot isolation and/or multi-version concurrency, either each active transaction is forced to search a database temporary area (e.g., roll-back segments) to fetch old values of desired records, or each transaction is forced to scan the entire table to find the older versions of the record in a multi-version database (in the absence of specialized temporal indexes). In this work, we describe a novel kV-Indirection structure to enable efficient (parallelizable) optimistic and pessimistic multi-version concurrency control by utilizing the old versions of records (at most two versions of each record) to provide direct access to the recent changes of records without the need of temporal indexes. 
As a result, our technique results in higher degree of concurrency by reducing the clashes between readers and writers of data and avoiding extended lock delays. We have a working prototype of our concurrency model and kV-Indirection structure in a commercial database and conducted an extensive evaluation to demonstrate the benefits of our multi-version concurrency control, and we obtained orders of magnitude speed up over the single-version concurrency control.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Su:2014:CEM, author = "Xueyuan Su and Garret Swart and Brian Goetz and Brian Oliver and Paul Sandoz", title = "Changing engines in midstream: a {Java} stream computational model for big data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1343--1354", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the addition of lambda expressions and the Stream API in Java 8, Java has gained a powerful and expressive query language that operates over in-memory collections of Java objects, making the transformation and analysis of data more convenient, scalable and efficient. In this paper, we build on Java 8 Stream and add a DistributableStream abstraction that supports federated query execution over an extensible set of distributed compute engines. Each query eventually results in the creation of a materialized result that is returned either as a local object or as an engine defined distributed Java Collection that can be saved and/or used as a source for future queries. 
Distinctively, DistributableStream supports the changing of compute engines both between and within a query, allowing different parts of a computation to be executed on different platforms. At execution time, the query is organized as a sequence of pipelined stages, each stage potentially running on a different engine. Each node that is part of a stage executes its portion of the computation on the data available locally or produced by the previous stage of the computation. This approach allows for computations to be assigned to engines based on pricing, data locality, and resource availability. Coupled with the inherent laziness of stream operations, this brings great flexibility to query planning and separates the semantics of the query from the details of the engine used to execute it. We currently support three engines, Local, Apache Hadoop MapReduce and Oracle Coherence, and we illustrate how new engines and data sources can be added.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2014:JEP, author = "Jae-Gil Lee and Gopi Attaluri and Ronald Barber and Naresh Chainani and Oliver Draese and Frederick Ho and Stratos Idreos and Min-Soo Kim and Sam Lightstone and Guy Lohman and Konstantinos Morfonios and Keshava Murthy and Ippokratis Pandis and Lin Qiao and Vijayshankar Raman and Vincent Kulandai Samy and Richard Sidle and Knut Stolze and Liping Zhang", title = "Joins on encoded and partitioned data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1355--1366", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Compression has historically been used to reduce the cost of storage, I/Os from that storage, and buffer pool utilization, at the expense of the CPU required to decompress data every time it is queried. However, significant additional CPU efficiencies can be achieved by deferring decompression as late in query processing as possible and performing query processing operations directly on the still-compressed data. In this paper, we investigate the benefits and challenges of performing joins on compressed (or encoded) data. We demonstrate the benefit of independently optimizing the compression scheme of each join column, even though join predicates relating values from multiple columns may require translation of the encoding of one join column into the encoding of the other. We also show the benefit of compressing ``payload'' data other than the join columns ``on the fly,'' to minimize the size of hash tables used in the join. 
By partitioning the domain of each column and defining separate dictionaries for each partition, we can achieve even better overall compression as well as increased flexibility in dealing with new values introduced by updates. Instead of decompressing both join columns participating in a join to resolve their different compression schemes, our system performs a light-weight mapping of only qualifying rows from one of the join columns to the encoding space of the other at run time. Consequently, join predicates can be applied directly on the compressed data. We call this procedure encoding translation. Two alternatives of encoding translation are developed and compared in the paper. We provide a comprehensive evaluation of these alternatives using product implementations of each on the TPC-H data set, and demonstrate that performing joins on encoded and partitioned data achieves both superior performance and excellent compression.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Poess:2014:TFI, author = "Meikel Poess and Tilmann Rabl and Hans-Arno Jacobsen and Brian Caufield", title = "{TPC-DI}: the first industry benchmark for data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1367--1378", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Historically, the process of synchronizing a decision support system with data from operational systems has been referred to as Extract, Transform, Load (ETL) and the tools supporting such process have been referred to as ETL tools. Recently, ETL was replaced by the more comprehensive acronym, data integration (DI). 
DI describes the process of extracting and combining data from a variety of data source formats, transforming that data into a unified data model representation and loading it into a data store. This is done in the context of a variety of scenarios, such as data acquisition for business intelligence, analytics and data warehousing, but also synchronization of data between operational applications, data migrations and conversions, master data management, enterprise data sharing and delivery of data services in a service-oriented architecture context, amongst others. With these scenarios relying on up-to-date information it is critical to implement a highly performing, scalable and easy to maintain data integration system. This is especially important as the complexity, variety and volume of data is constantly increasing and performance of data integration systems is becoming very critical. Despite the significance of having a highly performing DI system, there has been no industry standard for measuring and comparing their performance. The TPC, acknowledging this void, has released TPC-DI, an innovative benchmark for data integration. This paper motivates the reasons behind its development, describes its main characteristics including workload, run rules, metric, and explains key decisions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gupta:2014:RTT, author = "Pankaj Gupta and Venu Satuluri and Ajeet Grewal and Siva Gurumurthy and Volodymyr Zhabiuk and Quannan Li and Jimmy Lin", title = "Real-time {Twitter} recommendation: online motif detection in large dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1379--1380", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We describe a production Twitter system for generating relevant, personalized, and timely recommendations based on observing the temporally-correlated actions of each user's followings. The system currently serves millions of recommendations daily to tens of millions of mobile users. The approach can be viewed as a specific instance of the novel problem of online motif detection in large dynamic graphs. Our current solution partitions the graph across a number of machines, and with the construction of appropriate data structures, motif detection can be translated into the lookup and intersection of adjacency lists in each partition. We conclude by discussing a generalization of the problem that perhaps represents a new class of data management systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cha:2014:IDN, author = "Sang K. 
Cha and Kunsoo Park and Changbin Song and Kihong Kim and Cheol Ryu and Sunho Lee", title = "Interval disaggregate: a new operator for business planning", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1381--1392", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Business planning as well as analytics on top of large-scale database systems is valuable to decision makers, but planning operations known and implemented so far are very basic. In this paper we propose a new planning operation called interval disaggregate, which goes as follows. Suppose that the planner, typically the management of a company, plans sales revenues of its products in the current year. An interval of the expected revenue for each product in the current year is computed from historical data in the database as the prediction interval of linear regression on the data. A total target revenue for the current year is given by the planner. The goal of the interval disaggregate operation is to find an appropriate disaggregation of the target revenue, considering the intervals. We formulate the problem of interval disaggregation more precisely and give solutions for the problem. Multidimensional geometry plays a crucial role in the problem formulation and the solutions. We implemented interval disaggregation into the planning engine of SAP HANA and did experiments on real-world data. Our experiments show that interval disaggregation gives more appropriate solutions with respect to historical data than the known basic disaggregation called referential disaggregation. We also show that interval disaggregation can be combined with the deseasonalization technique when the dataset shows seasonal fluctuations.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:FFT, author = "Zhuo Zhang and Chao Li and Yangyu Tao and Renyu Yang and Hong Tang and Jie Xu", title = "{Fuxi}: a fault-tolerant resource management and job scheduling system at {Internet} scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1393--1404", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Scalability and fault-tolerance are two fundamental challenges for all distributed computing at Internet scale. Despite many recent advances from both academia and industry, these two problems are still far from settled. In this paper, we present Fuxi, a resource management and job scheduling system that is capable of handling the kind of workload at Alibaba where hundreds of terabytes of data are generated and analyzed every day to help optimize the company's business operations and user experiences. We employ several novel techniques to enable Fuxi to perform efficient scheduling of hundreds of thousands of concurrent tasks over large clusters with thousands of nodes: (1) an incremental resource management protocol that supports multi-dimensional resource allocation and data locality; (2) user-transparent failure recovery where failures of any Fuxi components will not impact the execution of user jobs; and (3) an effective detection mechanism and a multi-level blacklisting scheme that prevents them from affecting job execution. Our evaluation results demonstrate that 95\% and 91\% scheduled CPU/memory utilization can be fulfilled under synthetic workloads, and Fuxi is capable of achieving 2.36TB/minute throughput in GraySort. 
Additionally, the same Fuxi job only experiences approximately 16\% slowdown under a 5\% fault-injection rate. The slowdown only grows to 20\% when we double the fault-injection rate to 10\%. Fuxi has been deployed in our production environment since 2009, and it now manages hundreds of thousands of server nodes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Simmen:2014:LSG, author = "David Simmen and Karl Schnaitter and Jeff Davis and Yingjie He and Sangeet Lohariwala and Ajay Mysore and Vinayak Shenoi and Mingfeng Tan and Yu Xiao", title = "Large-scale graph analytics in {Aster 6}: bringing context to big data discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1405--1416", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph analytics is an important big data discovery technique. Applications include identifying influential employees for retention, detecting fraud in a complex interaction network, and determining product affinities by exploiting community buying patterns. Specialized platforms have emerged to satisfy the unique processing requirements of large-scale graph analytics; however, these platforms do not enable graph analytics to be combined with other analytics techniques, nor do they work well with the vast ecosystem of SQL-based business applications. Teradata Aster 6.0 adds support for large-scale graph analytics to its repertoire of analytics capabilities. The solution extends the multi-engine processing architecture with support for bulk synchronous parallel execution, and a specialized graph engine that enables iterative analysis of graph structures. 
Graph analytics functions written to the vertex-oriented API exposed by the graph engine can be invoked from the context of an SQL query and composed with existing SQL-MR functions, thereby enabling data scientists and business applications to express computations that combine large-scale graph analytics with techniques better suited to a different style of processing. The solution includes a suite of pre-built graph analytic functions adapted for parallel execution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2014:FFK, author = "Zhimin Chen and Vivek Narasayya and Surajit Chaudhuri", title = "Fast foreign-key detection in {Microsoft SQL Server} {PowerPivot} for {Excel}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1417--1428", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Microsoft SQL Server PowerPivot for Excel, or PowerPivot for short, is an in-memory business intelligence (BI) engine that enables Excel users to interactively create pivot tables over large data sets imported from sources such as relational databases, text files and web data feeds. Unlike traditional pivot tables in Excel that are defined on a single table, PowerPivot allows analysis over multiple tables connected via foreign-key joins. In many cases however, these foreign-key relationships are not known a priori, and information workers are often not sophisticated enough to define these relationships. Therefore, the ability to automatically discover foreign-key relationships in PowerPivot is valuable, if not essential. 
The key challenge is to perform this detection interactively and with high precision even when data sets scale to hundreds of millions of rows and the schema contains tens of tables and hundreds of columns. In this paper, we describe techniques for fast foreign-key detection in PowerPivot and experimentally evaluate its accuracy, performance and scale on both synthetic benchmarks and real-world data sets. These techniques have been incorporated into PowerPivot for Excel.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2014:BDS, author = "Meng-Chieh Yu and Tong Yu and Shao-Chen Wang and Chih-Jen Lin and Edward Y. Chang", title = "Big data small footprint: the design of a low-power classifier for detecting transportation modes", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1429--1440", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sensors on mobile phones and wearables, and in general sensors on IoT (Internet of Things), bring forth a couple of new challenges to big data research. First, the power consumption for analyzing sensor data must be low, since most wearables and portable devices are power-strapped. Second, the velocity of analyzing big data on these devices must be high, otherwise the limited local storage may overflow. This paper presents our hardware-software co-design of a classifier for wearables to detect a person's transportation mode (i.e., still, walking, running, biking, and on a vehicle). We particularly focus on addressing the big-data small-footprint requirement by designing a classifier that is low in both computational complexity and memory requirement. 
Together with a sensor-hub configuration, we are able to drastically reduce power consumption by 99\%, while maintaining competitive mode-detection accuracy. The data used in the paper is made publicly available for conducting research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Boykin:2014:SFI, author = "Oscar Boykin and Sam Ritchie and Ian O'Connell and Jimmy Lin", title = "{Summingbird}: a framework for integrating batch and online {MapReduce} computations", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1441--1451", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Summingbird is an open-source domain-specific language implemented in Scala and designed to integrate online and batch MapReduce computations in a single framework. Summingbird programs are written using dataflow abstractions such as sources, sinks, and stores, and can run on different execution platforms: Hadoop for batch processing (via Scalding/Cascading) and Storm for online processing. Different execution modes require different bindings for the dataflow abstractions (e.g., HDFS files or message queues for the source) but do not require any changes to the program logic. Furthermore, Summingbird can operate in a hybrid processing mode that transparently integrates batch and online results to efficiently generate up-to-date aggregations over long time spans. The language was designed to improve developer productivity and address pain points in building analytics solutions at Twitter where often, the same code needs to be written twice (once for batch processing and again for online processing) and indefinitely maintained in parallel. 
Our key insight is that certain algebraic structures provide the theoretical foundation for integrating batch and online processing in a seamless fashion. This means that Summingbird imposes constraints on the types of aggregations that can be performed, although in practice we have not found these constraints to be overly restrictive for a broad range of analytics tasks at Twitter.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ahmed:2014:SBT, author = "Rafi Ahmed and Rajkumar Sen and Meikel Poess and Sunil Chakkappen", title = "Of snowstorms and bushy trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1452--1461", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many workloads for analytical processing in commercial RDBMSs are dominated by snowstorm queries, which are characterized by references to multiple large fact tables and their associated smaller dimension tables. This paper describes a technique for bushy join tree optimization for snowstorm queries in Oracle database system. This technique generates bushy join trees containing subtrees that produce substantially reduced sets of rows and, therefore, their joins with other subtrees are generally much more efficient than joins in the left-deep trees. The generation of bushy join trees within an existing commercial physical optimizer requires extensive changes to the optimizer. Further, the optimizer will have to consider a large join permutation search space to generate efficient bushy join trees. 
The novelty of the approach is that bushy join trees can be generated outside the physical optimizer using logical query transformation that explores a considerably pruned search space. The paper describes an algorithm for generating optimal bushy join trees for snowstorm queries using an existing query transformation framework. It also presents performance results for this optimization, which show significant execution time improvements.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Vemuri:2014:EPS,
  author =       "Srinivas Vemuri and Maneesh Varshney and Krishna Puttaswamy and Rui Liu",
  title =        "Execution primitives for scalable joins and aggregations in {MapReduce}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1462--1473",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Analytics on Big Data is critical to derive business insights and drive innovation in today's Internet companies. Such analytics involve complex computations on large datasets, and are typically performed on MapReduce based frameworks such as Hive and Pig. However, in our experience, these systems are still quite limited in performing at scale. In particular, calculations that involve complex joins and aggregations, e.g. statistical calculations, scale poorly on these systems. In this paper we propose novel primitives for scaling such calculations. We propose a new data model for organizing datasets into calculation data units that are organized based on user-defined cost functions. We propose new operators that take advantage of these organized data units to significantly speed up joins and aggregations.
Finally, we propose strategies for dividing the aggregation load uniformly across worker processes that are very effective in avoiding skews and reducing (or in some cases even removing) the associated overheads. We have implemented all our proposed primitives in a framework called Rubix, which has been in production at LinkedIn for nearly a year. Rubix powers several applications and processes TBs of data each day. We have seen remarkable improvements in speed and cost of complex calculations due to these primitives.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Arauz:2014:CLT,
  author =       "Javier Arauz",
  title =        "{CAP} limits in telecom subscriber database design",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1474--1483",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "While the notion of a Distributed DBMS has been familiar to the IT industry for several decades, within telecom networks the subscriber data management based on DDBMS technology is a novel addition to a service provider's infrastructure. Service providers are used to telecom networks that are efficient, reliable and easy to maintain and operate, in part thanks to the node model used in designing such networks. A DDBMS spanning a large geographical area however incurs into distributed systems issues not previously seen in telecom networks. Identifying and delivering the right set of trade-offs that satisfies the service providers' needs while staying within the known physical bounds of a distributed system is therefore crucial if DDBMS are to conquer the subscriber management space within telecom networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bruno:2014:AJS,
  author =       "Nicolas Bruno and YongChul Kwon and Ming-Chuan Wu",
  title =        "Advanced join strategies for large-scale distributed computation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1484--1495",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Companies providing cloud-scale data services have increasing needs to store and analyze massive data sets (e.g., search logs, click streams, and web graph data). For cost and performance reasons, processing is typically done on large clusters of thousands of commodity machines by using high level scripting languages. In the recent past, there has been significant progress in adapting well-known techniques from traditional relational DBMSs to this new scenario. However, important challenges remain open. In this paper we study the very common join operation, discuss some unique challenges in the large-scale distributed scenario, and explain how to efficiently and robustly process joins in a distributed way. Specifically, we introduce novel execution strategies that leverage opportunities not available in centralized scenarios, and others that robustly handle data skew. We report experimental validations of our approaches on Scope production clusters, which power the Applications and Services Group at Microsoft.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2014:DSG,
  author =       "Yue Liu and Songlin Hu and Tilmann Rabl and Wantao Liu and Hans-Arno Jacobsen and Kaifeng Wu and Jian Chen and Jintao Li",
  title =        "{DGFIndex} for smart grid: enhancing {Hive} with a cost-effective multidimensional range index",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1496--1507",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In Smart Grid applications, as the number of deployed electric smart meters increases, massive amounts of valuable meter data is generated and collected every day. To enable reliable data collection and make business decisions fast, high throughput storage and high-performance analysis of massive meter data become crucial for grid companies. Considering the advantage of high efficiency, fault tolerance, and price-performance of Hadoop and Hive systems, they are frequently deployed as underlying platform for big data processing. However, in real business use cases, these data analysis applications typically involve multidimensional range queries (MDRQ) as well as batch reading and statistics on the meter data. While Hive is high-performance at complex data batch reading and analysis, it lacks efficient indexing techniques for MDRQ. In this paper, we propose DGFIndex, an index structure for Hive that efficiently supports MDRQ for massive meter data. DGFIndex divides the data space into cubes using the grid file technique. Unlike the existing indexes in Hive, which stores all combinations of multiple dimensions, DGFIndex only stores the information of cubes. This leads to smaller index size and faster query processing.
Furthermore, with pre-computing user-defined aggregations of each cube, DGFIndex only needs to access the boundary region for aggregation query. Our comprehensive experiments show that DGFIndex can save significant disk space in comparison with the existing indexes in Hive and the query performance with DGFIndex is 2-50 times faster than existing indexes in Hive and HadoopDB for aggregation query, 2-5 times faster than both for non-aggregation query, 2-75 times faster than scanning the whole table in different query selectivity.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yan:2014:EBS,
  author =       "Ying Yan and Liang Jeff Chen and Zheng Zhang",
  title =        "Error-bounded sampling for analytics on big sparse data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1508--1519",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Aggregation queries are at the core of business intelligence and data analytics. In the big data era, many scalable shared-nothing systems have been developed to process aggregation queries over massive amount of data. Microsoft's SCOPE is a well-known instance in this category. Nevertheless, aggregation queries are still expensive, because query processing needs to consume the entire data set, which is often hundreds of terabytes. Data sampling is a technique that samples a small portion of data to process and returns an approximate result with an error bound, thereby reducing the query's execution time.
While similar problems were studied in the database literature, we encountered new challenges that disable most of prior efforts: (1) error bounds are dictated by end users and cannot be compromised, (2) data is sparse, meaning data has a limited population but a wide range. For such cases, conventional uniform sampling often yield high sampling rates and thus deliver limited or no performance gains. In this paper, we propose error-bounded stratified sampling to reduce sample size. The technique relies on the insight that we may only reduce the sampling rate with the knowledge of data distributions. The technique has been implemented into Microsoft internal search query platform. Results show that the proposed approach can reduce up to 99\% sample size comparing with uniform sampling, and its performance is robust against data volume and other key performance metrics.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gankidi:2014:IHD,
  author =       "Vinitha Reddy Gankidi and Nikhil Teletia and Jignesh M. Patel and Alan Halverson and David J. DeWitt",
  title =        "Indexing {HDFS} data in {PDW}: splitting the data from the index",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1520--1528",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "There is a growing interest in making relational DBMSs work synergistically with MapReduce systems. However, there are interesting technical challenges associated with figuring out the right balance between the use and co-deployment of these systems.
This paper focuses on one specific aspect of this balance, namely how to leverage the superior indexing and query processing power of a relational DBMS for data that is often more cost-effectively stored in Hadoop/HDFS. We present a method to use conventional B+-tree indices in an RDBMS for data stored in HDFS and demonstrate that our approach is especially effective for highly selective queries.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sun:2014:CLS,
  author =       "Chong Sun and Narasimhan Rampalli and Frank Yang and AnHai Doan",
  title =        "{Chimera}: large-scale classification using machine learning, rules, and crowdsourcing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1529--1540",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Large-scale classification is an increasingly critical Big Data problem. So far, however, very little has been published on how this is done in practice. In this paper we describe Chimera, our solution to classify tens of millions of products into 5000+ product types at WalmartLabs. We show that at this scale, many conventional assumptions regarding learning and crowdsourcing break down, and that existing solutions cease to work. We describe how Chimera employs a combination of learning, rules (created by in-house analysts), and crowdsourcing to achieve accurate, continuously improving, and cost-effective classification. We discuss a set of lessons learned for other similar Big Data systems. In particular, we argue that at large scales crowdsourcing is critical, but must be used in combination with learning, rules, and in-house analysts.
We also argue that using rules (in conjunction with learning) is a must, and that more research attention should be paid to helping analysts create and manage (tens of thousands of) rules more effectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bonifati:2014:IJQ,
  author =       "Angela Bonifati and Radu Ciucanu and S{\l}awek Staworko",
  title =        "Interactive join query inference with {JIM}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1541--1544",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Specifying join predicates may become a cumbersome task in many situations e.g., when the relations to be joined come from disparate data sources, when the values of the attributes carry little or no knowledge of metadata, or simply when the user is unfamiliar with querying formalisms. Such task is recurrent in many traditional data management applications, such as data integration, constraint inference, and database denormalization, but it is also becoming pivotal in novel crowdsourcing applications. We present Jim (Join Inference Machine), a system for interactive join specification tasks, where the user infers an $n$-ary join predicate by selecting tuples that are part of the join result via Boolean membership queries. The user can label tuples as positive or negative, while the system allows to identify and gray out the uninformative tuples i.e., those that do not add any information to the final learning goal. The tool also guides the user to reach her join inference goal with a minimal number of interactions.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zheng:2014:MMS,
  author =       "Yuxin Zheng and Zhifeng Bao and Lidan Shou and Anthony K. H. Tung",
  title =        "{MESA}: a map service to support fuzzy type-ahead search over geo-textual data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1545--1548",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Geo-textual data are ubiquitous these days. Recent study on spatial keyword search focused on the processing of queries which retrieve objects that match certain keywords within a spatial region. To ensure effective data retrieval, various extensions were done including the tolerance of errors in keyword matching and the search-as-you-type feature using prefix matching. We present MESA, a map application to support different variants of spatial keyword query. In this demonstration, we adopt the autocompletion paradigm that generates the initial query as a prefix matching query. If there are few matching results, other variants are performed as a form of relaxation that reuses the processing done in earlier phases. The types of relaxation allowed include spatial region expansion and exact/approximate prefix/substring matching. MESA adopts the client-server architecture. It provides fuzzy type-ahead search over geo-textual data. The core of MESA is to adopt a unifying search strategy, which incrementally applies the relaxation in an appropriate order to maximize the efficiency of query processing. In addition, MESA equips a user-friendly interface to interact with users and visualize results. MESA also provides customized search to meet the needs of different users.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Wang:2014:RRT,
  author =       "Henan Wang and Guoliang Li and Huiqi Hu and Shuo Chen and Bingwen Shen and Hao Wu and Wen-Syan Li and Kian-Lee Tan",
  title =        "{R3}: a real-time route recommendation system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1549--1552",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Existing route recommendation systems have two main weaknesses. First, they usually recommend the same route for all users and cannot help control traffic jam. Second, they do not take full advantage of real-time traffic to recommend the best routes. To address these two problems, we develop a real-time route recommendation system, called R3, aiming to provide users with the real-time-traffic-aware routes. R3 recommends diverse routes for different users to alleviate the traffic pressure. R3 utilizes historical taxi driving data and real-time traffic data and integrates them together to provide users with real-time route recommendation.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Benedikt:2014:PPD,
  author =       "Michael Benedikt and Julien Leblay and Efthymia Tsamoura",
  title =        "{PDQ}: proof-driven query answering over {Web}-based data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1553--1556",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The data needed to answer queries is often available through Web-based APIs. Indeed, for a given query there may be many Web-based sources which can be used to answer it, with the sources overlapping in their vocabularies, and differing in their access restrictions (required arguments) and cost. We introduce PDQ (Proof-Driven Query Answering), a system for determining a query plan in the presence of web-based sources. It is: (i) constraint-aware --- exploiting relationships between sources to rewrite an expensive query into a cheaper one, (ii) access-aware --- abiding by any access restrictions known in the sources, and (iii) cost-aware --- making use of any cost information that is available about services. PDQ takes the novel approach of generating query plans from proofs that a query is answerable. We demonstrate the use of PDQ and its effectiveness in generating low-cost plans.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hassan:2014:DFA,
  author =       "Naeemul Hassan and Afroza Sultana and You Wu and Gensheng Zhang and Chengkai Li and Jun Yang and Cong Yu",
  title =        "Data in, fact out: automated monitoring of facts by {FactWatcher}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1557--1560",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Towards computational journalism, we present FactWatcher, a system that helps journalists identify data-backed, attention-seizing facts which serve as leads to news stories. FactWatcher discovers three types of facts, including situational facts, one-of-the-few facts, and prominent streaks, through a unified suite of data model, algorithm framework, and fact ranking measure. Given an append-only database, upon the arrival of a new tuple, FactWatcher monitors if the tuple triggers any new facts. Its algorithms efficiently search for facts without exhaustively testing all possible ones. Furthermore, FactWatcher provides multiple features in striving for an end-to-end system, including fact ranking, fact-to-statement translation and keyword-based fact search.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yuan:2014:ODA,
  author =       "Mingxuan Yuan and Ke Deng and Jia Zeng and Yanhua Li and Bing Ni and Xiuqiang He and Fei Wang and Wenyuan Dai and Qiang Yang",
  title =        "{OceanST}: a distributed analytic system for large-scale spatiotemporal mobile broadband data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1561--1564",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "With the increasing prevalence of versatile mobile devices and the fast deployment of broadband mobile networks, a huge volume of Mobile Broadband (MBB) data has been generated over time. The MBB data naturally contain rich information of a large number of mobile users, covering a considerable fraction of whole population nowadays, including the mobile applications they are using at different locations and time; the MBB data may present the unprecedentedly large knowledge base of human behavior which has highly recognized commercial and social value. However, the storage, management and analysis of the huge and fast growing volume of MBB data post new and significant challenges to the industrial practitioners and research community. In this demonstration, we present a new, MBB data tailored, distributed analytic system named OceanST which has addressed a series of problems and weaknesses of the existing systems, originally designed for more general purpose and capable to handle MBB data to some extent.
OceanST is featured by (i) efficiently loading of ever-growing MBB data, (ii) a bunch of spatiotemporal aggregate queries and basic analysis APIs frequently found in various MBB data application scenarios, and (iii) sampling-based approximate solution with provable accuracy bound to cope with huge volume of MBB data. The demonstration will show the advantage of OceanST in a cluster of 5 machines using 3TB data.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Geerts:2014:TAF,
  author =       "Floris Geerts and Giansalvatore Mecca and Paolo Papotti and Donatello Santoro",
  title =        "That's all folks!: {Llunatic} goes open source",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1565--1568",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "It is widely recognized that whenever different data sources need to be integrated into a single target database errors and inconsistencies may arise, so that there is a strong need to apply data-cleaning techniques to repair the data. Despite this need, database research has so far investigated mappings and data repairing essentially in isolation. Unfortunately, schema-mappings and data quality rules interact with each other, so that applying existing algorithms in a pipelined way --- i.e., first exchange then data, then repair the result --- does not lead to solutions even in simple settings. We present the Llunatic mapping and cleaning system, the first comprehensive proposal to handle schema mappings and data repairing in a uniform way. Llunatic is based on the intuition that transforming and cleaning data are different facets of the same problem, unified by their declarative nature.
This holistic approach allows us to incorporate unique features into the system, such as configurable user interaction and a tunable trade-off between efficiency and quality of the solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Liu:2014:HMA,
  author =       "Weimo Liu and Saad Bin Suhaim and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das and Ali Jaoua",
  title =        "{HDBTracker}: monitoring the aggregates on dynamic hidden web databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1569--1572",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Numerous web databases, e.g., amazon.com, eBay.com, are ``hidden'' behind (i.e., accessible only through) their restrictive search and browsing interfaces. This demonstration showcases HDBTracker, a web-based system that reveals and tracks (the changes of) user-specified aggregate queries over such hidden web databases, especially those that are frequently updated, by issuing a small number of search queries through the public web interfaces of these databases. The ability to track and monitor aggregates has applications over a wide variety of domains --- e.g., government agencies can track COUNT of openings at online job hunting websites to understand key economic indicators, while businesses can track the AVG price of a product over a basket of e-commerce websites to understand the competitive landscape and/or material costs. A key technique used in HDBTracker is RS-ESTIMATOR, the first algorithm that can efficiently monitor changes to aggregate query answers over a hidden web database.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Xia:2014:BBA,
  author =       "Fan Xia and Ye Li and Chengcheng Yu and Haixin Ma and Weining Qian",
  title =        "{BSMA}: a benchmark for analytical queries over social media data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1573--1576",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The demonstration of a benchmark, named as BSMA, for Benchmarking Social Media Analytics, is introduced in this paper. BSMA is designed to benchmark data management systems supporting analytical queries over social media. It is different to existing benchmarks in that: (1) Both real-life data and a synthetic data generator are provided. The real-life dataset contains a social network of 1.6 million users, and all their tweeting and retweeting activities. The data generator can generate both social networks and synthetic timelines that follow data distributions determined by predefined parameters. (2) A set of workloads are provided. The data generator is in responsible for producing updates. A workload generator produces queries based on predefined query templates by generating query arguments online. BSMA workloads cover a large amount of queries with graph operations, temporal queries, hotspot queries, and aggregate queries. Furthermore, the argument generator is capable of sampling data items in the timeline following power-law distribution online. (3) A toolkit is provided to measure and report the performance of systems that implement the benchmark. Furthermore, a prototype system based on dataset and workloads of BSMA is also implemented. The demonstration will include two parts, i.e.
the internals of data and workload generator, as well as the performance testing of reference implementations.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Petermann:2014:GBD,
  author =       "Andr{\'e} Petermann and Martin Junghanns and Robert M{\"u}ller and Erhard Rahm",
  title =        "Graph-based data integration and business intelligence with {BIIIG}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1577--1580",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We demonstrate BIIIG (Business Intelligence with Integrated Instance Graphs), a new system for graph-based data integration and analysis. It aims at improving business analytics compared to traditional OLAP approaches by comprehensively tracking relationships between entities and making them available for analysis. BIIIG supports a largely automatic data integration pipeline for metadata and instance data. Metadata from heterogeneous sources are integrated in a so-called Unified Metadata Graph (UMG) while instance data is combined in a single integrated instance graph (IIG). A unique feature of BIIIG is the concept of business transaction graphs, which are derived from the IIG and which reflect all steps involved in a specific business process. Queries and analysis tasks can refer to the entire instance graph or sets of business transaction graphs. In the demonstration, we perform all data integration steps and present analytic queries including pattern matching and graph-based aggregation of business measures.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Vartak:2014:SAG,
  author =       "Manasi Vartak and Samuel Madden and Aditya Parameswaran and Neoklis Polyzotis",
  title =        "{SeeDB}: automatically generating query visualizations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1581--1584",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data analysts operating on large volumes of data often rely on visualizations to interpret the results of queries. However, finding the right visualization for a query is a laborious and time-consuming task. We demonstrate SeeDB, a system that partially automates this task: given a query, SeeDB explores the space of all possible visualizations, and automatically identifies and recommends to the analyst those visualizations it finds to be most ``interesting'' or ``useful''. In our demonstration, conference attendees will see SeeDB in action for a variety of queries on multiple real-world datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dutt:2014:QEA,
  author =       "Anshuman Dutt and Sumit Neelam and Jayant R. Haritsa",
  title =        "{QUEST}: an exploratory approach to robust query processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "7",
  number =       "13",
  pages =        "1585--1588",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Feb 4 17:20:31 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lei:2014:RIR, author = "Chuan Lei and Zhongfang Zhuang and Elke A. Rundensteiner and Mohamed Y. Eltabakh", title = "Redoop infrastructure for recurring big data queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1589--1592", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demonstration presents the Redoop infrastructure, the first full-fledged MapReduce framework with native support for recurring big data queries. Recurring queries, repeatedly being executed for long periods of time over evolving high-volume data, have become a bedrock component in most large-scale data analytic applications. Redoop is a comprehensive extension to Hadoop that pushes the support and optimization of recurring queries into Hadoop's core functionality. While backward compatible with regular MapReduce jobs, Redoop achieves an order of magnitude better performance than Hadoop for recurring workloads. Redoop employs innovative window-aware optimization techniques for such recurring workloads including adaptive window-aware data partitioning, cache-aware task scheduling, and inter-window caching mechanisms. We will demonstrate Redoop's capabilities on a compute cluster against real life workloads including click-stream and sensor data analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Brucato:2014:PTP, author = "Matteo Brucato and Rahul Ramakrishna and Azza Abouzied and Alexandra Meliou", title = "{PackageBuilder}: from tuples to packages", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1593--1596", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demo, we present PackageBuilder, a system that extends database systems to support package queries. A package is a collection of tuples that individually satisfy base constraints and collectively satisfy global constraints. The need for package support arises in a variety of scenarios: For example, in the creation of meal plans, users are not only interested in the nutritional content of individual meals (base constraints), but also care to specify daily consumption limits and control the balance of the entire plan (global constraints). We introduce PaQL, a declarative SQL-based package query language, and the interface abstractions which allow users to interactively specify package queries and easily navigate through their results. To efficiently evaluate queries, the system employs pruning and heuristics, as well as state-of-the-art constraint optimization solvers. We demonstrate PackageBuilder by allowing attendees to interact with the system's interface, to define PaQL queries and to observe how query evaluation is performed.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amsterdamer:2014:OAC, author = "Yael Amsterdamer and Susan B. 
Davidson and Tova Milo and Slava Novgorodov and Amit Somech", title = "Ontology assisted crowd mining", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1597--1600", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present OASSIS (for Ontology ASSISted crowd mining), a prototype system which allows users to declaratively specify their information needs, and mines the crowd for answers. The answers that the system computes are concise and relevant, and represent frequent, significant data patterns. The system is based on (1) a generic model that captures both ontological knowledge, as well as the individual knowledge of crowd members from which frequent patterns are mined; (2) a query language in which users can specify their information needs and types of data patterns they seek; and (3) an efficient query evaluation algorithm, for mining semantically concise answers while minimizing the number of questions posed to the crowd. We will demonstrate OASSIS using a couple of real-life scenarios, showing how users can formulate and execute queries through the OASSIS UI and how the relevant data is mined from the crowd.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2014:SSE, author = "Lisi Chen and Yan Cui and Gao Cong and Xin Cao", title = "{SOPS}: a system for efficient processing of spatial-keyword publish\slash subscribe", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1601--1604", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive amount of data that are geo-tagged and associated with text information are being generated at an unprecedented scale. These geo-textual data cover a wide range of topics. Users are interested in receiving up-to-date geo-textual objects (e.g., geo-tagged Tweets) such that their locations meet users' need and their texts are interesting to users. For example, a user may want to be updated with tweets near her home on the topic ``dengue fever headache''. In this demonstration, we present SOPS, the Spatial-Keyword Publish/Subscribe System, that is capable of efficiently processing spatial keyword continuous queries. SOPS supports two types of queries: (1) Boolean Range Continuous (BRC) query that can be used to subscribe the geo-textual objects satisfying a boolean keyword expression and falling in a specified spatial region; (2) Temporal Spatial-Keyword Top-$k$ Continuous (TaSK) query that continuously maintains up-to-date top-$k$ most relevant results over a stream of geo-textual objects. SOPS enables users to formulate their queries and view the real-time results over a stream of geo-textual objects by browser-based user interfaces. On the server side, we propose solutions to efficiently processing a large number of BRC queries (tens of millions) and TaSK queries over a stream of geo-textual objects.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shirakawa:2014:MLI, author = "Masumi Shirakawa and Takahiro Hara and Shojiro Nishio", title = "{MLJ}: language-independent real-time search of tweets reported by media outlets and journalists", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1605--1608", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we introduce MLJ (MultiLingual Journalism, http://mljournalism.com), a first Web-based system that enables users to search any topic of latest tweets posted by media outlets and journalists beyond languages. Handling multilingual tweets in real time involves many technical challenges: language barrier, sparsity of words, and real-time data stream. To overcome the language barrier and the sparsity of words, MLJ harnesses CL-ESA, a Wikipedia-based language-independent method to generate a vector of Wikipedia pages (entities) from an input text. To continuously deal with tweet stream, we propose one-pass DP-means, an online clustering method based on DP-means. Given a new tweet as an input, MLJ generates a vector using CL-ESA and classifies it into one of clusters using one-pass DP-means. By interpreting a search query as a vector, users can instantly search clusters containing latest related tweets from the query without being aware of language differences. MLJ as of March 2014 supports nine languages including English, Japanese, Korean, Spanish, Portuguese, German, French, Italian, and Arabic covering 24 countries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bress:2014:OHO, author = "Sebastian Bre{\ss} and Bastian K{\"o}cher and Max Heimel and Volker Markl and Michael Saecker and Gunter Saake", title = "{Ocelot\slash HyPE}: optimized data processing on heterogeneous hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1609--1612", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The past years saw the emergence of highly heterogeneous server architectures that feature multiple accelerators in addition to the main processor. Efficiently exploiting these systems for data processing is a challenging research problem that comprises many facets, including how to find an optimal operator placement strategy, how to estimate runtime costs across different hardware architectures, and how to manage the code and maintenance blowup caused by having to support multiple architectures. In prior work, we already discussed solutions to some of these problems: First, we showed that specifying operators in a hardware-oblivious way can prevent code blowup while still maintaining competitive performance when supporting multiple architectures. Second, we presented learning cost functions and several heuristics to efficiently place operators across all available devices. In this demonstration, we provide further insights into this line of work by presenting our combined system Ocelot/HyPE. Our system integrates a hardware-oblivious data processing engine with a learning query optimizer for placement decisions, resulting in a highly adaptive DBMS that is specifically tailored towards heterogeneous hardware environments.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:MMO, author = "Fei Wu and Tobias Kin Hou Lei and Zhenhui Li and Jiawei Han", title = "{MoveMine 2.0}: mining object relationships from movement data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1613--1616", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The development in positioning technology has enabled us to collect a huge amount of movement data from moving objects, such as human, animals, and vehicles. The data embed rich information about the relationships among moving objects and have applications in many fields, e.g., in ecological study and human behavioral study. Previously, we have proposed a system MoveMine that integrates several start-of-art movement mining methods. However, it does not include recent methods on relationship pattern mining. Thus, we propose to extend MoveMine to MoveMine 2.0 by adding substantial new methods in mining dynamic relationship patterns. Newly added methods focus on two types of pairwise relationship patterns: (i) attraction/avoidance relationship, and (ii) following pattern. A user-friendly interface is designed to support interactive exploration of the result and provides flexibility in tuning parameters. MoveMine 2.0 is tested on multiple types of real datasets to ensure its practical use. Our system provides useful tools for domain experts to gain insights on real dataset. Meanwhile, it will promote further research in relationship mining from moving objects.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2014:PFA, author = "Liwen Sun and Sanjay Krishnan and Reynold S. Xin and Michael J. Franklin", title = "A partitioning framework for aggressive data skipping", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1617--1620", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose to demonstrate a fine-grained partitioning framework that reorganizes the data tuples into small blocks at data loading time. The goal is to enable queries to maximally skip scanning data blocks. The partition framework consists of four steps: (1) workload analysis, which extracts features from a query workload, (2) augmentation, which augments each data tuple with a feature vector, (3) reduce, which succinctly represents a set of data tuples using a set of feature vectors, and (4) partitioning, which performs a clustering algorithm to partition the feature vectors and uses the clustering result to guide the actual data partitioning. Our experiments show that our techniques result in a 3-7x query response time improvement over traditional range partitioning due to more effective data skipping.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2014:IOE, author = "Lei Cao and Qingyang Wang and Elke A. 
Rundensteiner", title = "Interactive outlier exploration in big data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1621--1624", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate our VSOutlier system for supporting interactive exploration of outliers in big data streams. VSOutlier not only supports a rich variety of outlier types supported by innovative and efficient outlier detection strategies, but also provides a rich set of interactive interfaces to explore outliers in real time. Using the stock transactions dataset from the US stock market and the moving objects dataset from MITRE, we demonstrate that the VSOutlier system enables analysts to more efficiently identify, understand, and respond to phenomena of interest in near real-time even when applied to high volume streams.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{To:2014:SAE, author = "Quoc-Cuong To and Benjamin Nguyen and Philippe Pucheral", title = "{SQL\slash AA}: executing {SQL} on an asymmetric architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1625--1628", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Current applications, from complex sensor systems (e.g. quantified self) to online e-markets acquire vast quantities of personal information which usually end-up on central servers. 
This information represents an unprecedented potential for user customized applications and business (e.g., car insurance billing, carbon tax, traffic decongestion, resource optimization in smart grids, healthcare surveillance, participatory sensing). However, the PRISM affair has shown that public opinion is starting to wonder whether these new services are not bringing us closer to science fiction dystopias. It has become clear that centralizing and processing all one's data on a single server is a major problem with regards to privacy concerns. Conversely, decentralized architectures, devised to help individuals keep full control of their data, complexify global treatments and queries, often impeding the development of innovative services and applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2014:GGS, author = "Zhao Chen and Rui Fu and Ziyuan Zhao and Zheng Liu and Leihao Xia and Lei Chen and Peng Cheng and Caleb Chen Cao and Yongxin Tong and Chen Jason Zhang", title = "{gMission}: a general spatial crowdsourcing platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1629--1632", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As one of the successful forms of using Wisdom of Crowd, crowdsourcing, has been widely used for many human intrinsic tasks, such as image labeling, natural language understanding, market predication and opinion mining. Meanwhile, with advances in pervasive technology, mobile devices, such as mobile phones and tablets, have become extremely popular. These mobile devices can work as sensors to collect multimedia data(audios, images and videos) and location information. 
This power makes it possible to implement the new crowdsourcing mode: spatial crowdsourcing. In spatial crowdsourcing, a requester can ask for resources related a specific location, the mobile users who would like to take the task will travel to that place and get the data. Due to the rapid growth of mobile device uses, spatial crowdsourcing is likely to become more popular than general crowdsourcing, such as Amazon Turk and Crowdflower. However, to implement such a platform, effective and efficient solutions for worker incentives, task assignment, result aggregation and data quality control must be developed. In this demo, we will introduce gMission, a general spatial crowdsourcing platform, which features with a collection of novel techniques, including geographic sensing, worker detection, and task recommendation. We introduce the sketch of system architecture and illustrate scenarios via several case analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cetintemel:2014:SSN, author = "Ugur Cetintemel and Jiang Du and Tim Kraska and Samuel Madden and David Maier and John Meehan and Andrew Pavlo and Michael Stonebraker and Erik Sutherland and Nesime Tatbul and Kristin Tufte and Hao Wang and Stanley Zdonik", title = "{S-Store}: a streaming {NewSQL} system for big velocity applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1633--1636", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "First-generation streaming systems did not pay much attention to state management via ACID transactions (e.g., [3, 4]). S-Store is a data management system that combines OLTP transactions with stream processing. 
To create S-Store, we begin with H-Store, a main-memory transaction processing engine, and add primitives to support streaming. This includes triggers and transaction workflows to implement push-based processing, windows to provide a way to bound the computation, and tables with hidden state to implement scoping for proper isolation. This demo explores the benefits of this approach by showing how a na{\"\i}ve implementation of our benchmarks using only H-Store can yield incorrect results. We also show that by exploiting push-based semantics and our implementation of triggers, we can achieve significant improvement in transaction throughput. We demo two modern applications: (i) leaderboard maintenance for a version of ``American Idol'', and (ii) a city-scale bicycle rental scenario.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2014:CRT, author = "Runquan Xie and Feida Zhu and Hui Ma and Wei Xie and Chen Lin", title = "{CLEar}: a real-time online observatory for bursty and viral events", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1637--1640", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We describe our demonstration of CLEar (CLairaudient Ear), a real-time online platform for detecting, monitoring, summarizing, contextualizing and visualizing bursty and viral events, those triggering a sudden surge of public interest and going viral on micro-blogging platforms. 
This task is challenging for existing methods as they either use complicated topic models to analyze topics in a off-line manner or define temporal structure of fixed granularity on the data stream for online topic learning, leaving them hardly scalable for real-time stream like that of Twitter. In this demonstration of CLEar, we present a three-stage system: First, we show a real-time bursty event detection module based on a data-sketch topic model which makes use of acceleration of certain stream quantities as the indicators of topic burstiness to trigger efficient topic inference. Second, we demonstrate popularity prediction for the detected bursty topics and event summarization based on clustering related topics detected in successive time periods. Third, we illustrate CLEar's module for contextualizing and visualizing the event evolution both along time-line and across other news media to offer an easier understanding of the events.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Suh:2014:ALI, author = "Young-Kyoon Suh and Richard T. Snodgrass and Rui Zhang", title = "{AZDBLab}: a laboratory information system for large-scale empirical {DBMS} studies", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1641--1644", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the database field, while very strong mathematical and engineering work has been done, the scientific approach has been much less prominent. The deep understanding of query optimizers obtained through the scientific approach can lead to better engineered designs. 
Unlike other domains, there have been few DBMS-dedicated laboratories, focusing on such scientific investigation. In this demonstration, we present a novel DBMS-oriented research infrastructure, called Arizona Database Laboratory (AZDBLab), to assist database researchers in conducting a large-scale empirical study across multiple DBMSes. For them to test their hypotheses on the behavior of query optimizers, AZDBLab can run and monitor a large-scale experiment with thousands (or millions) of queries on different DBMSes. Furthermore, AZDBLab can help users automatically analyze these queries. In the demo, the audience will interact with AZDBLab through the stand-alone application and the mobile app to conduct such a large-scale experiment for a study. The audience will then run a Tucson Timing Protocol analysis on the finished experiment and then see the analysis (data sanity check and timing) results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:TTM, author = "Qi Wang and Manohar Kaul and Cheng Long and Raymond Chi-Wing Wong", title = "{Terrain-Toolkit}: a multi-functional tool for terrain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1645--1648", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Terrain data is becoming increasingly popular both in industry and in academia. Many tools have been developed for visualizing terrain data. 
However, we find that (1) they usually accept very few data formats of terrain data only; (2) they do not support terrain simplification well which, as will be shown, is used heavily for query processing in spatial databases; and (3) they do not provide the surface distance operator which is fundamental for many applications based on terrain data. Motivated by this, we developed a tool called Terrain-Toolkit for terrain data which accepts a comprehensive set of data formats, supports terrain simplification and provides the surface distance operator.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fu:2014:FDC, author = "Yupeng Fu and Kian Win Ong and Yannis Papakonstantinou and Erick Zamora", title = "{Forward}: data-centric {UIs} using declarative templates that efficiently wrap third-party {JavaScript} components", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1649--1652", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While Ajax programming and the plethora of JavaScript component libraries enable high-quality UIs in web applications, integrating them with page data is laborious and error-prone as a developer has to handcode incremental modifications with trigger-based programming and manual coordination of data dependencies. The FORWARD web framework simplifies the development of Ajax applications through declarative, state-based templates. This declarative, data-centric approach is characterized by the principle of logical/physical independence, which the database community has often deployed successfully. 
It enables FORWARD to leverage database techniques, such as incremental view maintenance, updatable views, capability-based component wrappers and cost-based optimization to automate efficient live visualizations. We demonstrate an end-to-end system implementation, including a web-based IDE (itself built in FORWARD), academic and commercial applications built in FORWARD and a wide variety of JavaScript components supported by the declarative templates.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2014:SSP, author = "Xika Lin and Abhishek Mukherji and Elke A. Rundensteiner and Matthew O. Ward", title = "{SPIRE}: supporting parameter-driven interactive rule mining and exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1653--1656", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate our SPIRE technology for supporting interactive mining of both positive and negative rules at the speed of thought. It is often misleading to learn only about positive rules, yet extremely revealing to find strongly supported negative rules. Key technical contributions of SPIRE including region-wise abstractions of rules, positive-negative rule relationship analysis, rule redundancy management and rule visualization supporting novel exploratory queries will be showcased. The audience can interactively explore complex rule relationships in a visual manner, such as comparing negative rules with their positive counterparts, that would otherwise take prohibitive time. 
Overall, our SPIRE system provides data analysts with rich insights into rules and rule relationships while significantly reducing manual effort and time investment required.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Anderson:2014:IDE, author = "Michael R. Anderson and Michael Cafarella and Yixing Jiang and Guan Wang and Bochun Zhang", title = "An integrated development environment for faster feature engineering", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1657--1660", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The application of machine learning to large datasets has become a core component of many important and exciting software systems being built today. The extreme value in these trained systems is tempered, however, by the difficulty of constructing them. As shown by the experience of Google, Netflix, IBM, and many others, a critical problem in building trained systems is that of feature engineering. High-quality machine learning features are crucial for the system's performance but are difficult and time-consuming for engineers to develop. Data-centric developer tools that improve the productivity of feature engineers will thus likely have a large impact on an important area of work. We have built a demonstration integrated development environment for feature engineers. It accelerates one particular step in the feature engineering development cycle: evaluating the effectiveness of novel feature code. In particular, it uses an index and runtime execution planner to process raw data objects (e.g., Web pages) in order of descending likelihood that the data object will be relevant to the user's feature code. 
This demonstration IDE allows the user to write arbitrary feature code, evaluate its impact on learner quality, and observe exactly how much faster our technique performs compared to a baseline system.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xiong:2014:PSD, author = "Pengcheng Xiong and Hakan Hacig{\"u}m{\"u}s", title = "{Pronto}: a software-defined networking based system for performance management of analytical queries on distributed data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1661--1664", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays data analytics applications are accessing more and more data from distributed data stores, creating large amount of data traffic on the network. Therefore, distributed analytic queries are prone to suffer from bad performance in terms of query execution time when they encounter a network resource contention, which is quite common in a shared network. Typical distributed query optimizers do not have a way to solve this problem because historically they have been treating the network underneath as a black-box: they are unable to monitor it, let alone to control it. However, we are entering a new era of software-defined networking (SDN), which provides visibility into and control of the network's state for the applications including distributed database systems. In this demonstration, we present a system, called Pronto that leverages the SDN capabilities for a distributed query processor to achieve performance improvement and differentiation for analytical queries. The system is the real implementation of our recently developed methods on commercial SDN products. 
The demonstration shows the shortcomings of a distributed query optimizer, which treats the underlying network as a black box, and the advantages of the SDN-based approach by allowing the users to selectively explore various relevant and interesting settings in a distributed query processing environment.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:GYB, author = "Rui Zhang and Reshu Jain and Prasenjit Sarkar and Lukas Rupprecht", title = "Getting your big data priorities straight: a demonstration of priority-based {QoS} using social-network-driven stock recommendation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1665--1668", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As we come to terms with various big data challenges, one vital issue remains largely untouched. That is the optimal multiplexing and prioritization of different big data applications sharing the same underlying infrastructure, for example, a public cloud platform. Given these demanding applications and the necessary practice to avoid over-provisioning, resource contention between applications is inevitable. Priority must be given to important applications (or sub workloads in an application) in these circumstances. This demo highlights the compelling impact prioritization could make, using an example application that recommends promising combinations of stocks to purchase based on relevant Twitter sentiment. The application consists of a batch job and an interactive query, ran simultaneously. Our underlying solution provides a unique capability to identify and differentiate application workloads throughout a complex big data platform. 
Its current implementation is based on Apache Hadoop and the IBM GPFS distributed storage system. The demo showcases the superior interactive query performance achievable by prioritizing its workloads and thereby avoiding I/O bandwidth contention. The query time is 3.6 $ \times $ better compared to no prioritization. Such a performance is within 0.3\% of that of an idealistic system where the query runs without contention. The demo is conducted on around 3 months of Twitter data, pertinent to the S\&P 100 index, with about $ 4 \times 10^{12} $ potential stock combinations considered.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jindal:2014:VYR, author = "Alekh Jindal and Praynaa Rawlani and Eugene Wu and Samuel Madden and Amol Deshpande and Mike Stonebraker", title = "{Vertexica}: your relational friend for graph analytics!", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1669--1672", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present Vertexica, a graph analytics tools on top of a relational database, which is user friendly and yet highly efficient. Instead of constraining programmers to SQL, Vertexica offers a popular vertex-centric query interface, which is more natural for analysts to express many graph queries. The programmers simply provide their vertex-compute functions and Vertexica takes care of efficiently executing them in the standard SQL engine. The advantage of using Vertexica is its ability to leverage the relational features and enable much more sophisticated graph analysis.
These include expressing graph algorithms which are difficult in vertex-centric but straightforward in SQL and the ability to compose end-to-end data processing pipelines, including pre- and post- processing of graphs as well as combining multiple algorithms for deeper insights. Vertexica has a graphical user interface and we outline several demonstration scenarios including, interactive graph analysis, complex graph analysis, and continuous and time series analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Quamar:2014:NNC, author = "Abdul Quamar and Amol Deshpande and Jimmy Lin", title = "{NScale}: neighborhood-centric analytics on large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1673--1676", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is an increasing interest in executing rich and complex analysis tasks over large-scale graphs, many of which require processing and reasoning about a large number of multi-hop neighborhoods or subgraphs in the graph. Examples of such tasks include ego network analysis, motif counting in biological networks, finding social circles, personalized recommendations, link prediction, anomaly detection, analyzing influence cascades, and so on. These tasks are not well served by existing vertex-centric graph processing frameworks whose computation and execution models limit the user program to directly access the state of a single vertex, resulting in high communication, scheduling, and memory overheads in executing such tasks. 
Further, most existing graph processing frameworks also typically ignore the challenges in extracting the relevant portions of the graph that an analysis task is interested in, and loading it onto distributed memory. In this demonstration proposal, we describe NScale, a novel end-to-end graph processing framework that enables the distributed execution of complex neighborhood-centric analytics over large-scale graphs in the cloud. NScale enables users to write programs at the level of neighborhoods or subgraphs. NScale uses Apache YARN for efficient and fault-tolerant distribution of data and computation; it features GEL, a novel graph extraction and loading phase, that extracts the relevant portions of the graph and loads them into distributed memory using as few machines as possible. NScale utilizes novel techniques for the distributed execution of user computation that minimize memory consumption by exploiting overlap among the neighborhoods of interest. A comprehensive experimental evaluation shows orders-of-magnitude improvements in performance and total cost over vertex-centric approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:DDP, author = "Haoran Li and Li Xiong and Lifan Zhang and Xiaoqian Jiang", title = "{DPSynthesizer}: differentially private data synthesizer for privacy preserving data sharing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1677--1680", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy has recently emerged in private statistical data release as one of the strongest privacy guarantees. 
Releasing synthetic data that mimic original data with differential privacy provides a promising way for privacy preserving data sharing and analytics while providing a rigorous privacy guarantee. However, to this date there is no open-source tools that allow users to generate differentially private synthetic data, in particular, for high dimensional and large domain data. Most of the existing techniques that generate differentially private histograms or synthetic data only work well for single dimensional or low-dimensional histograms. They become problematic for high dimensional and large domain data due to increased perturbation error and computation complexity. We propose DPSynthesizer, a toolkit for differentially private data synthesization. The core of DPSynthesizer is DPCopula designed for high-dimensional and large-domain data. DPCopula computes a differentially private copula function from which synthetic data can be sampled. Copula functions are used to describe the dependence between multivariate random vectors and allow us to build the multivariate joint distribution using one-dimensional marginal distributions. DPSynthesizer also implements a set of state-of-the-art methods for building differentially private histograms, suitable for low-dimensional data, from which synthetic data can be generated. We will demonstrate the system using DPCopula as well as other methods with various data sets and show the feasibility, utility, and efficiency of various methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kong:2014:SLS, author = "Longbo Kong and Zhi Liu and Yan Huang", title = "{SPOT}: locating social media users based on social network context", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1681--1684", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A tremendous amount of information is being shared everyday on social media sites such as Facebook, Twitter or Google+. But only a small portion of users provide their location information, which can be helpful in targeted advertisement and many other services. In this demo we present our large scale user location estimation system, SPOT, which showcase different location estimating models on real world data sets. The demo shows three different location estimation algorithms: a friend-based, a social closeness-based, and an energy and local social coefficient based. The first algorithm is a baseline and the other two new algorithms utilize social closeness information which was traditionally treated as a binary friendship. The two algorithms are based on the premise that friends are different and close friends can help to estimate location better. The demo will also show that all three algorithms benefit from a confidence-based iteration method. The demo is web-based. A user can specify different settings, explore the estimation results on a map, and observe the statistical information, e.g. accuracy and average friends used in the estimation, dynamically. The demo provides two datasets: Twitter (148,860 located users) and Gowalla (99,563 located users). Furthermore, a user can filter users with certain features, e.g. 
with more than 100 friends, to see how the estimating models work on a particular case. The estimated and real locations of those users as well as their friends will be displayed on the map.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alavi:2014:RQE, author = "Zohreh Alavi and Lu Zhou and James Powers and Keke Chen", title = "{RASP-QS}: efficient and confidential query services in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1685--1688", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hosting data query services in public clouds is an attractive solution for its great scalability and significant cost savings. However, data owners also have concerns on data privacy due to the lost control of the infrastructure. This demonstration shows a prototype for efficient and confidential range/kNN query services built on top of the random space perturbation (RASP) method. The RASP approach provides a privacy guarantee practical to the setting of cloud-based computing, while enabling much faster query processing compared to the encryption-based approach. This demonstration will allow users to more intuitively understand the technical merits of the RASP approach via interactive exploration of the visual interface.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kunjir:2014:TTM, author = "Mayuresh Kunjir and Prajakta Kalmegh and Shivnath Babu", title = "{Thoth}: towards managing a multi-system cluster", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1689--1692", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Following the 'no one size fits all' philosophy, active research in big data platforms is focusing on creating an environment for multiple 'one-size' systems to co-exist and cooperate in the same cluster. Consequently, it has now become imperative to provide an integrated management solution that provides a database-centric view of the underlying multi-system environment. We outline the proposal of DBMS$^+$, a database management platform over multiple 'one-size' systems. Our prototype implementation of DBMS$^+$, called Thoth, adaptively chooses a best-fit system based on application requirements. In this demonstration, we propose to showcase Thoth DM, a data management framework for Thoth which consists of a data collection pipeline utility, data consolidation and dispatcher module, and a warehouse for storing this data. We further introduce the notion of apps; an app is a utility that registers with Thoth DM and interfaces with its warehouse to provide core database management functionalities like dynamic provisioning of resources, designing a multi-system-aware optimizer, tuning of configuration parameters on each system, data storage, and layout schemes. We will demonstrate Thoth DM in action over Hive, Hadoop, Shark, Spark, and the Hadoop Distributed File System. 
This demonstration will focus on the following apps: (i) Dashboard for administration and control that will let the audience monitor and visualize a database-centric view of the multi-system cluster, and (ii) Data Layout Recommender app will allow searching for the optimal data layout in the multi-system setting.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:XLC, author = "Lei Zhang and Achim Rettinger", title = "{X-LiSA}: cross-lingual semantic annotation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1693--1696", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ever-increasing quantities of structured knowledge on the Web and the impending need of multilinguality and cross-linguality for information access pose new challenges but at the same time open up new opportunities for knowledge extraction research. In this regard, cross-lingual semantic annotation has emerged as a topic of major interest and it is essential to build tools that can link words and phrases in unstructured text in one language to resources in structured knowledge bases in any other language. In this paper, we demonstrate X-LiSA, an infrastructure for cross-lingual semantic annotation, which supports both service-oriented and user-oriented interfaces for annotating text documents and web pages in different languages using resources from Wikipedia and Linked Open Data (LOD).", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jayachandran:2014:CUI, author = "Prasanth Jayachandran and Karthik Tunga and Niranjan Kamat and Arnab Nandi", title = "Combining user interaction, speculative query execution and sampling in the {DICE} system", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1697--1700", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The interactive exploration of data cubes has become a popular application, especially over large datasets. In this paper, we present DICE, a combination of a novel frontend query interface and distributed aggregation backend that enables interactive cube exploration. DICE provides a convenient, practical alternative to the typical offline cube materialization strategy by allowing the user to explore facets of the data cube, trading off accuracy for interactive response-times, by sampling the data. We consider the time spent by the user perusing the results of their current query as an opportunity to execute and cache the most likely followup queries. The frontend presents a novel intuitive interface that allows for sampling-aware aggregations, and encourages interaction via our proposed faceted model. The design of our backend is tailored towards the low-latency user interaction at the frontend, and vice-versa. We discuss the synergistic design behind both the frontend user experience and the backend architecture of DICE; and, present a demonstration that allows the user to fluidly interact with billion-tuple datasets within sub-second interactive response times.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Su:2014:SSM, author = "Han Su and Kai Zheng and Kai Zeng and Jiamin Huang and Xiaofang Zhou", title = "{STMaker}: a system to make sense of trajectory data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1701--1704", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Widely adoption of GPS-enabled devices generates large amounts of trajectories every day. The raw trajectory data describes the movement history of moving objects by a sequence of (longitude, latitude, time-stamp) triples, which are nonintuitive for human to perceive the prominent features of the trajectory, such as where and how the moving object travels. In this demo, we present the STMaker system to help users make sense of individual trajectories. Given a trajectory, STMaker can automatically extract the significant semantic behavior of the trajectory, and summarize the behavior by a short human-readable text. In this paper, we first introduce the phrases of generating trajectory summarizations, and then show several real trajectory summarization cases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jugel:2014:FVA, author = "Uwe Jugel and Zbigniew Jerzak and Gregor Hackenbroich and Volker Markl", title = "Faster visual analytics through pixel-perfect aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1705--1708", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "State-of-the-art visual data analysis tools ignore bandwidth limitations. They fetch millions of records of high-volume time series data from an underlying RDBMS to eventually draw only a few thousand pixels on the screen. In this work, we demonstrate a pixel-aware big data visualization system that dynamically adapts the number of data points transmitted and thus the data rate, while preserving pixel-perfect visualizations. We show how to carefully select the data points to fetch for each pixel of a visualization, using a visualization-driven data aggregation that models the visualization process. Defining all required data reduction operators at the query level, our system trades off a few milliseconds of query execution time for dozens of seconds of data transfer time. The results are significantly reduced response times and a near real-time visualization of millions of data points. Using our pixel-aware system, the audience will be able to enjoy the speed and ease of big data visualizations and learn about the scientific background of our system through an interactive evaluation component, allowing the visitor to measure, visualize, and compare competing visualization-related data reduction techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2014:SBG, author = "Arijit Khan and Sameh Elnikety", title = "Systems for big-graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1709--1710", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs have become increasingly important to represent highly-interconnected structures and schema-less data including the World Wide Web, social networks, knowledge graphs, genome and scientific databases, medical and government records. The massive scale of graph data easily overwhelms the main memory and computation resources on commodity servers. In these cases, achieving low latency and high throughput requires partitioning the graph and processing the graph data in parallel across a cluster of servers. However, the software and hardware advances that have worked well for developing parallel databases and scientific applications are not necessarily effective for big-graph problems. Graph processing poses interesting system challenges: graphs represent relationships which are usually irregular and unstructured; and therefore, the computation and data access patterns have poor locality. Hence, the last few years has seen an unprecedented interest in building systems for big-graphs by various communities including databases, systems, semantic web, machine learning, and operations research. In this tutorial, we discuss the design of the emerging systems for processing of big-graphs, key features of distributed graph algorithms, as well as graph partitioning and workload balancing techniques. We emphasize the current challenges and highlight some future research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gal:2014:UER, author = "Avigdor Gal", title = "Uncertain entity resolution: re-evaluating entity resolution in the big data era: tutorial", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1711--1712", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity resolution is a fundamental problem in data integration dealing with the combination of data from different sources to a unified view of the data. Entity resolution is inherently an uncertain process because the decision to map a set of records to the same entity cannot be made with certainty unless these are identical in all of their attributes or have a common key. In the light of recent advancement in data accumulation, management, and analytics landscape (known as big data) the tutorial re-evaluates the entity resolution process and in particular looks at best ways to handle data veracity. The tutorial ties entity resolution with recent advances in probabilistic database research, focusing on sources of uncertainty in the entity resolution process.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Suchanek:2014:KBA, author = "Fabian M. 
Suchanek and Gerhard Weikum", title = "Knowledge bases in the age of big data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1713--1714", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This tutorial gives an overview on state-of-the-art methods for the automatic construction of large knowledge bases and harnessing them for data and text analytics. It covers both big-data methods for building knowledge bases and knowledge bases being assets for big-data applications. The tutorial also points out challenges and research opportunities.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Meliou:2014:CED, author = "Alexandra Meliou and Sudeepa Roy and Dan Suciu", title = "Causality and explanations in databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1715--1716", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the surge in the availability of information, there is a great demand for tools that assist users in understanding their data. While today's exploration tools rely mostly on data visualization, users often want to go deeper and understand the underlying causes of a particular observation. This tutorial surveys research on causality and explanation for data-oriented applications. 
We will review and summarize the research thus far into causality and explanation in the database and AI communities, giving researchers a snapshot of the current state of the art on this topic, and propose a unified framework as well as directions for future research. We will cover both the theory of causality/explanation and some applications; we also discuss the connections with other topics in database research like provenance, deletion propagation, why-not queries, and OLAP techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:ESB, author = "Yunyao Li and Ziyang Liu and Huaiyu Zhu", title = "Enterprise search in the big data era: recent developments and open challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1717--1718", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Enterprise search allows users in an enterprise to retrieve desired information through a simple search interface. It is widely viewed as an important productivity tool within an enterprise. While Internet search engines have been highly successful, enterprise search remains notoriously challenging due to a variety of unique challenges, and is being made more so by the increasing heterogeneity and volume of enterprise data. On the other hand, enterprise search also presents opportunities to succeed in ways beyond current Internet search capabilities. This tutorial presents an organized overview of these challenges and opportunities, and reviews the state-of-the-art techniques for building a reliable and high quality enterprise search engine, in the context of the rise of big data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:VPD, author = "Yunyao Li and Erich Neuhold", title = "{VLDB 2014} {Ph.D.} workshop: an overview", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1719--1719", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The VLDB 2014 PhD Workshop is an one-day event to be held in Hangzhou, China on September 1st, 2014, in conjunction with VLDB 2014. The aim of this workshop is to provide helpful feedback, useful information and networking opportunities that can benefit the students' dissertation work as well as their long-term career. The selection process and the workshop program were carefully designed with this specific goal in mind. The accepted submissions are included in the online proceedings for the Workshop at \url{http://www.vldb.org/2014/phd_workshop_proceedings.html}.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Venkataraman:2014:DCG, author = "Shivakumar Venkataraman and Divyakant Agrawal", title = "Datacenters as computers: {Google} engineering \& database research perspectives", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1720--1721", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this collaborative keynote address, we will share Google's experience in building a scalable data infrastructure that leverages datacenters for managing Google's advertising data over the last decade.
In order to support the massive online advertising platform at Google, the data infrastructure must simultaneously support both transactional and analytical workloads. The focus of this talk will be to highlight how the datacenter architecture and the cloud computing paradigm has enabled us to manage the exponential growth in data volumes and user queries, make our services highly available and fault tolerant to massive datacenter outages, and deliver results with very low latencies. We note that other Internet companies have also undergone similar growth in data volumes and user queries. In fact, this phenomenon has resulted in at least two new terms in the technology lexicon: big data and cloud computing. Cloud computing (and datacenters) have been largely responsible for scaling the data volumes from terabytes range just a few years ago to now reaching in the exabyte range over the next couple of years. Delivering solutions at this scale that are fault-tolerant, latency sensitive, and highly available requires a combination of research advances with engineering ingenuity at Google and elsewhere. Next, we will try to answer the following question: is a datacenter just another (very large) computer? Or, does it fundamentally change the design principles for data-centric applications and systems. We will conclude with some of the unique research challenges that need to be addressed in order to sustain continuous growth in data volumes while supporting high throughput and low latencies.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Plattner:2014:ICM, author = "Hasso Plattner", title = "The impact of columnar in-memory databases on enterprise systems: implications of eliminating transaction-maintained aggregates", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1722--1729", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Five years ago I proposed a common database approach for transaction processing and analytical systems using a columnar in-memory database, disputing the common belief that column stores are not suitable for transactional workloads. Today, the concept has been widely adopted in academia and industry and it is proven that it is feasible to run analytical queries on large data sets directly on a redundancy-free schema, eliminating the need to maintain pre-built aggregate tables during data entry transactions. The resulting reduction in transaction complexity leads to a dramatic simplification of data models and applications, redefining the way we build enterprise systems. First analyses of productive applications adopting this concept confirm that system architectures enabled by in-memory column stores are conceptually superior for business transaction processing compared to row-based approaches. Additionally, our analyses show a shift of enterprise workloads to even more read-oriented processing due to the elimination of updates of transaction-maintained aggregates.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Markl:2014:BCD, author = "Volker Markl", title = "Breaking the chains: on declarative data analysis and data independence in the big data era", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1730--1733", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data management research, systems, and technologies have drastically improved the availability of data analysis capabilities, particularly for non-experts, due in part to low-entry barriers and reduced ownership costs (e.g., for data management infrastructures and applications). Major reasons for the widespread success of database systems and today's multi-billion dollar data management market include data independence, separating physical representation and storage from the actual information, and declarative languages, separating the program specification from its intended execution environment. In contrast, today's big data solutions do not offer data independence and declarative specification. As a result, big data technologies are mostly employed in newly-established companies with IT-savvy employees or in large well-established companies with big IT departments. We argue that current big data solutions will continue to fall short of widespread adoption, due to usability problems, despite the fact that in-situ data analytics technologies achieve a good degree of schema independence. In particular, we consider the lack of a declarative specification to be a major road-block, contributing to the scarcity in available data scientists available and limiting the application of big data to the IT-savvy industries. 
In particular, data scientists currently have to spend a lot of time on tuning their data analysis programs for specific data characteristics and a specific execution environment. We believe that the research community needs to bring the powerful concepts of declarative specification to current data analysis systems, in order to achieve the broad big data technology adoption and effectively deliver the promise that novel big data technologies offer.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Neumann:2014:EHP, author = "Thomas Neumann", title = "Engineering high-performance database engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1734--1741", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Developing a database engine is both challenging and rewarding. Database engines are very complex software artifacts that have to scale to large data sizes and large hardware configurations, and developing such systems usually means choosing between different trade-offs at various points of development. This papers gives a survey over two different database engines, the disk-based SPARQL-processing engine RDF-3X, and the relational main-memory engine HyPer. It discusses the design choices that were made during development, and highlights optimization techniques that are important for both systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2014:RLC, author = "Wei Cao and Feng Yu and Jiasen Xie", title = "Realization of the low cost and high performance {MySQL} cloud database", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1742--1747", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MySQL is a low cost, high performance, good reliability and open source database product, widely used in many Internet companies. For example, there are thousands of MySQL servers being used in Taobao. Although NoSQL developed very quickly in past two years, and new products emerged in endlessly, but in the actual business application of NoSQL, the requirements to developers are relatively high. Moreover, MySQL has many more mature middleware, maintenance tools and a benign ecological circle, so from this perspective, MySQL dominates in the whole situation, while NoSQL is as a supplement. We (the core system database team of Taobao) have done a lot of work in the field of MySQL hosting platform, designed and implemented a UMP (Unified MySQL Platform) system, to provide a low cost and high performance MySQL cloud database service.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qin:2014:FCS, author = "An Qin and Dianming Hu and Jun Liu and Wenjun Yang and Dai Tan", title = "{Fatman}: cost-saving and reliable archival storage based on volunteer resources", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1748--1753", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Fatman, an enterprise-scale archival storage based on volunteer contribution resources from underutilized web servers, usually deployed on thousands of nodes with spare storage capacity. Fatman is specifically designed for enhancing the utilization of existing storage resources and cutting down the hardware purchase cost. Two major concerned issues of the system design are maximizing the resource utilization of volunteer nodes without violating Service Level Objectives (SLOs) and minimizing the cost without reducing the availability of archival system. Fatman has been widely deployed on tens of thousands of server nodes across several datacenters, provided more than 100PB storage capacity and served dozens of internal mass-data applications. The system realizes an efficient storage quota consolidation by strong isolation and budget limitation, to maximally support resources contribution without any degradation on host-level SLOs. It firstly improves data reliability by applying disk failure prediction to diminish failure recovery cost, named fault-aware data management, dramatically reduces the MTTR by 76.3\% and decreases file crash ratio by 35\% on real-life product workload.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:DIR, author = "Shiming Zhang and Yin Yang and Wei Fan and Marianne Winslett", title = "Design and implementation of a real-time interactive analytics system for large spatio-temporal data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1754--1759", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In real-time interactive data analytics, the user expects to receive the results of each query within a short time period such as seconds. This is especially challenging when the data is big (e.g., on the scale of petabytes), and the analytics system runs on top of cloud infrastructure (e.g., thousands of interconnected commodity servers). We have been building such a system, called OceanRT, for managing large spatio-temporal data such as call logs and mobile web browsing records collected by a telecommunication company. Although there already exist systems for querying big data in real time, OceanRT's performance stands out due to several novel designs and components that address key efficiency and scalability issues that were largely overlooked in existing systems. First, OceanRT makes extensive use of software RDMA one-sided operations, which reduce networking costs without requiring specialized hardware. Second, OceanRT exploits the parallel computing capabilities of each node in the cloud through a novel architecture consisting of Access-Query Engines (AQEs) connected with minimal overhead. Third, OceanRT contains a novel storage scheme that optimizes for queries with joins and multi-dimensional selections, which are common for large spatio-temporal data. 
Experiments using the TPC-DS benchmark show that OceanRT is usually more than an order of magnitude faster than the current state-of-the-art systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dai:2014:PRS, author = "Chaoyue Dai and Feng Qian and Wei Jiang and Zhoutian Wang and Zenghong Wu", title = "A personalized recommendation system for {NetEase} dating site", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1760--1765", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the popularity of internet, more and more people try to find friends or dating partners on online dating web sites. Recommending appropriate partners from a large amount of candidates becomes an interesting and challenging problem in the field of recommendation system. Various types of recommendation techniques (e.g., content based recommendation, collaborative filtering and association rule mining) have be proposed to tackle this problem. However most of them ignore the personalization concerns that they (1) mainly consider the hot users or frequent items, (2) cover only a portion of users especially ignoring the long tails, (3) and cannot deal with the cold start problem properly. In this paper, we present a regression based hybrid recommendation system that makes use of matching degree, fancy degree, activity, sincerity, popularity and enthusiasm, to recommend appropriate partners. 
The experimental evaluation of our recommendation system on a real dating web site shows our strategy is more effective and efficient than its previous version which follows the principle of giving higher priority to the recent active users.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ling:2014:GIH, author = "Zheng Jye Ling and Quoc Trung Tran and Ju Fan and Gerald C. H. Koh and Thi Nguyen and Chuen Seng Tan and James W. L. Yip and Meihui Zhang", title = "{GEMINI}: an integrative healthcare analytics system", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1766--1771", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Healthcare systems around the world are facing the challenge of information overload in caring for patients in an affordable, safe and high-quality manner in a system with limited healthcare resources and increasing costs. To alleviate this problem, we develop an integrative healthcare analytics system called GEMINI which allows point of care analytics for doctors where real-time usable and relevant information of their patients are required through the questions they asked about the patients they are caring for. GEMINI extracts data of each patient from various data sources and stores them as information in a patient profile graph. The data sources are complex and varied consisting of both structured data (such as, patients' demographic data, laboratory results and medications) and unstructured data (such as, doctors' notes). 
Hence, the patient profile graph provides a holistic and comprehensive information of patients' healthcare profile, from which GEMINI can infer implicit information useful for administrative and clinical purposes, and extract relevant information for performing predictive analytics. At the core, GEMINI keeps interacting with the healthcare professionals as part of a feedback loop to gather, infer, ascertain and enhance the self-learning knowledge base. We present a case study on using GEMINI to predict the risk of unplanned patient readmissions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zou:2014:MTD, author = "Yongqiang Zou and Xing Jin and Yi Li and Zhimao Guo and Eryu Wang and Bin Xiao", title = "{Mariana}: {Tencent} deep learning platform and its applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1772--1777", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep learning gains lots of attentions in recent years and is more and more important for mining values in big data. However, to make deep learning practical for a wide range of applications in Tencent Inc., three requirements must be considered: (1) Lots of computational power are required to train a practical model with tens of millions of parameters and billions of samples for products such as automatic speech recognition (ASR), and the number of parameters and training data is still growing. (2) The capability of training larger model is necessary for better model quality. (3) Easy to use frameworks are valuable to do many experiments to perform model selection, such as finding an appropriate optimization algorithm and tuning optimal hyper-parameters. 
To accelerate training, support large models, and make experiments easier, we built Mariana, the Tencent deep learning platform, which utilizes GPU and CPU cluster to train models parallelly with three frameworks: (1) a multi-GPU data parallelism framework for deep neural networks (DNNs). (2) a multi-GPU model parallelism and data parallelism framework for deep convolutional neural networks (CNNs). (3) a CPU cluster framework for large scale DNNs. Mariana also provides built-in algorithms and features to facilitate experiments. Mariana is in production usage for more than one year, achieves state-of-the-art acceleration performance, and plays a key role in training models and improving quality for automatic speech recognition and image recognition in Tencent WeChat, a mobile social platform, and for Ad click-through rate prediction (pCTR) in Tencent QQ, an instant messaging platform, and Tencent Qzone, a social networking service.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:YPC, author = "Sai Wu and Chun Chen and Gang Chen and Ke Chen and Lidan Shou and Hui Cao and He Bai", title = "{YZStack}: provisioning customizable solution for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1778--1783", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "YZStack is our developing solution which implements many well-established big data techniques as selectable modules and allows users to customize their systems as a process of module selection. 
In particular, it includes an openstack based IaaS (Infrastructure as a Service) layer, a distributed file system based DaaS (Data as a Service) layer, a PaaS (Platform as a Service) layer equipped with parallel processing techniques and a SaaS (Software as a Service) layer with popular data analytic algorithms. Layers of YZStack are loosely connected, so that customization of one layer does not affect the other layers and their interactions. In this paper, we use a smart financial system developed for the Zhejiang Provincial Department of Finance to demonstrate how to leverage YZStack to speed up the implementation of big data system. We also introduce two popular applications of the financial system, economic prediction and detection of improper payment.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Klonatos:2014:EBE, author = "Yannis Klonatos and Christoph Koch and Tiark Rompf and Hassan Chafi", title = "Errata for {``Building efficient query engines in a high-level language'': PVLDB {\bf 7}(10):853--864}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "13", pages = "1784--1784", month = aug, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:31 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{Klonatos:2014:BEQ}.", abstract = "This is in response to recent feedback from our peers that calls for a number of clarifications regarding the experimental section of our paper.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2014:SMM, author = "Wei Lu and Shanshan Chen and Keqian Li and Laks V. S. 
Lakshmanan", title = "Show me the money: dynamic recommendations for revenue maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1785--1796", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recommender Systems (RS) play a vital role in applications such as e-commerce and on-demand content streaming. Research on RS has mainly focused on the customer perspective, i.e., accurate prediction of user preferences and maximization of user utilities. As a result, most existing techniques are not explicitly built for revenue maximization, the primary business goal of enterprises. In this work, we explore and exploit a novel connection between RS and the profitability of a business. As recommendations can be seen as an information channel between a business and its customers, it is interesting and important to investigate how to make strategic dynamic recommendations leading to maximum possible revenue. To this end, we propose a novel revenue model that takes into account a variety of factors including prices, valuations, saturation effects, and competition amongst products. Under this model, we study the problem of finding revenue-maximizing recommendation strategies over a finite time horizon. We show that this problem is NP-hard, but approximation guarantees can be obtained for a slightly relaxed version, by establishing an elegant connection to matroid theory. Given the prohibitively high complexity of the approximation algorithm, we also design intelligent heuristics for the original problem. Finally, we conduct extensive experiments on two real and synthetic datasets and demonstrate the efficiency, scalability, and effectiveness our algorithms, and that they significantly outperform several intuitive baselines.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2014:SSG, author = "Peng Lu and Gang Chen and Beng Chin Ooi and Hoang Tam Vo and Sai Wu", title = "{ScalaGiST}: scalable generalized search trees for {MapReduce} systems [innovative systems paper]", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1797--1808", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce has become the state-of-the-art for data parallel processing. Nevertheless, Hadoop, an open-source equivalent of MapReduce, has been noted to have sub-optimal performance in the database context since it is initially designed to operate on raw data without utilizing any type of indexes. To alleviate the problem, we present ScalaGiST --- scalable generalized search tree that can be seamlessly integrated with Hadoop, together with a cost-based data access optimizer for efficient query processing at run-time. ScalaGiST provides extensibility in terms of data and query types, hence is able to support unconventional queries (e.g., multi-dimensional range and $k$-NN queries) in MapReduce systems, and can be dynamically deployed in large cluster environments for handling big users and data. We have built ScalaGiST and demonstrated that it can be easily instantiated to common B$^+$-tree and R-tree indexes yet for dynamic distributed environments. Our extensive performance study shows that ScalaGiST can provide efficient write and read performance, elastic scaling property, as well as effective support for MapReduce execution of ad-hoc analytic queries.
Performance comparisons with recent proposals of specialized distributed index structures, such as SpatialHadoop, Data Mapping, and RT-CAN further confirm its efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2014:FPK, author = "Mohan Yang and Bolin Ding and Surajit Chaudhuri and Kaushik Chakrabarti", title = "Finding patterns in a knowledge base using keywords to compose table answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1809--1820", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We aim to provide table answers to keyword queries using a knowledge base. For queries referring to multiple entities, like ``Washington cities population'' and ``Mel Gibson movies'', it is better to represent each relevant answer as a table which aggregates a set of entities or joins of entities within the same table scheme or pattern. In this paper, we study how to find highly relevant patterns in a knowledge base for user-given keyword queries to compose table answers. A knowledge base is modeled as a directed graph called knowledge graph, where nodes represent its entities and edges represent the relationships among them. Each node/edge is labeled with type and text. A pattern is an aggregation of subtrees which contain all keywords in the texts and have the same structure and types on node/edges. We propose efficient algorithms to find patterns that are relevant to the query for a class of scoring functions. We show the hardness of the problem in theory, and propose path-based indexes that are affordable in memory. 
Two query-processing algorithms are proposed: one is fast in practice for small queries (with small numbers of patterns as answers) by utilizing the indexes; and the other one is better in theory, with running time linear in the sizes of indexes and answers, which can handle large queries better. We also conduct extensive experimental study to compare our approaches with a naive adaption of known techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2014:PAG, author = "Da Yan and James Cheng and Kai Xing and Yi Lu and Wilfred Ng and Yingyi Bu", title = "{Pregel} algorithms for graph connectivity problems with performance guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1821--1832", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs in real life applications are often huge, such as the Web graph and various social networks. These massive graphs are often stored and processed in distributed sites. In this paper, we study graph algorithms that adopt Google's Pregel, an iterative vertex-centric framework for graph processing in the Cloud. We first identify a set of desirable properties of an efficient Pregel algorithm, such as linear space, communication and computation cost per iteration, and logarithmic number of iterations. We define such an algorithm as a practical Pregel algorithm (PPA). We then propose PPAs for computing connected components (CCs), biconnected components (BCCs) and strongly connected components (SCCs). The PPAs for computing BCCs and SCCs use the PPAs of many fundamental graph problems as building blocks, which are of interest by themselves. 
Extensive experiments over large real graphs verified the efficiency of our algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shang:2014:AAG, author = "Zechao Shang and Jeffrey Xu Yu", title = "Auto-approximation of graph computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1833--1844", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the big data era, graph computing is one of the challenging issues because there are numerous large graph datasets emerging from real applications. A question is: do we need to know the final exact answer for a large graph? When it is impossible to know the exact answer in a limited time, is it possible to approximate the final answer in an automatic and systematic way without having to designing new approximate algorithms? The main idea behind the question is: it is more important to find out something meaningful quick from a large graph, and we should focus on finding a way of making use of large graphs instead of spending time on designing approximate algorithms. In this paper, we give an innovative approach which automatically and systematically synthesizes a program to approximate the original program. We show that we can give users some answers with reasonable accuracy and high efficiency for a wide spectrum of graph algorithms, without having to know the details of graph algorithms. We have conducted extensive experimental studies using many graph algorithms that are supported in the existing graph systems and large real graphs. 
Our extensive experimental results reveal that our automatically approximating approach is highly feasible.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Furche:2014:DTW, author = "Tim Furche and Georg Gottlob and Giovanni Grasso and Xiaonan Guo and Giorgio Orsi and Christian Schallhart and Cheng Wang", title = "{DIADEM}: thousands of websites to a single database", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1845--1856", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The web is overflowing with implicitly structured data, spread over hundreds of thousands of sites, hidden deep behind search forms, or siloed in marketplaces, only accessible as HTML. Automatic extraction of structured data at the scale of thousands of websites has long proven elusive, despite its central role in the ``web of data''. Through an extensive evaluation spanning over 10000 web sites from multiple application domains, we show that automatic, yet accurate full-site extraction is no longer a distant dream. diadem is the first automatic full-site extraction system that is able to extract structured data from different domains at very high accuracy. It combines automated exploration of websites, identification of relevant data, and induction of exhaustive wrappers. Automating these components is the first challenge. diadem overcomes this challenge by combining phenomenological and ontological knowledge. Integrating these components is the second challenge. diadem overcomes this challenge through a self-adaptive network of relational transducers that produces effective wrappers for a wide variety of websites. 
Our extensive and publicly available evaluation shows that, for more than 90\% of sites from three domains, diadem obtains an effective wrapper that extracts all relevant data with 97\% average precision. diadem also tolerates noisy entity recognisers, and its components individually outperform comparable approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2014:UAQ, author = "Wentao Wu and Xi Wu and Hakan Hac{\i}g{\"u}m{\"u}{\c{s}} and Jeffrey F. Naughton", title = "Uncertainty aware query execution time prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1857--1868", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Predicting query execution time is a fundamental issue underlying many database management tasks. Existing predictors rely on information such as cardinality estimates and system performance constants that are difficult to know exactly. As a result, accurate prediction still remains elusive for many queries. However, existing predictors provide a single, point estimate of the true execution time, but fail to characterize the uncertainty in the prediction. In this paper, we take a first step towards providing uncertainty information along with query execution time predictions. We use the query optimizer's cost model to represent the query execution time as a function of the selectivities of operators in the query plan as well as the constants that describe the cost of CPU and I/O operations in the system. By treating these quantities as random variables rather than constants, we show that with low overhead we can infer the distribution of likely prediction errors.
We further show that the estimated prediction errors by our proposed techniques are strongly correlated with the actual prediction errors.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Konstantinidis:2014:OCS, author = "George Konstantinidis and Jos{\'e} Luis Ambite", title = "Optimizing the chase: scalable data integration under constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1869--1880", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We are interested in scalable data integration and data exchange under constraints/dependencies. In data exchange the problem is how to materialize a target database instance, satisfying the source-to-target and target dependencies, that provides the certain answers. In data integration, the problem is how to rewrite a query over the target schema into a query over the source schemas that provides the certain answers. In both these problems we make use of the chase algorithm, the main tool to reason with dependencies. Our first contribution is to introduce the frugal chase, which produces smaller universal solutions than the standard chase, still remaining polynomial in data complexity. Our second contribution is to use the frugal chase to scale up query answering using views under LAV weakly acyclic target constraints, a useful language capturing RDF/S. The latter problem can be reduced to query rewriting using views without constraints by chasing the source-to-target mappings with the target constraints. We construct a compact graph-based representation of the mappings and the constraints and develop an efficient algorithm to run the frugal chase on this representation. 
We show experimentally that our approach scales to large problems, speeding up the compilation of the dependencies into the mappings by close to 2 and 3 orders of magnitude, compared to the standard and the core chase, respectively. Compared to the standard chase, we improve online query rewriting time by a factor of 3, while producing equivalent, but smaller, rewritings of the original query.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Athanassoulis:2014:BTA, author = "Manos Athanassoulis and Anastasia Ailamaki", title = "{BF}-tree: approximate tree indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1881--1892", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The increasing volume of time-based generated data and the shift in storage technologies suggest that we might need to reconsider indexing. Several workloads --- like social and service monitoring --- often include attributes with implicit clustering because of their time-dependent nature. In addition, solid state disks (SSD) (using flash or other low-level technologies) emerge as viable competitors of hard disk drives (HDD). Capacity and access times of storage devices create a trade-off between SSD and HDD. Slow random accesses in HDD have been replaced by efficient random accesses in SSD, but their available capacity is one or more orders of magnitude more expensive than the one of HDD. Indexing, however, is designed assuming HDD as secondary storage, thus minimizing random accesses at the expense of capacity. Indexing data using SSD as secondary storage requires treating capacity as a scarce resource. 
To this end, we introduce approximate tree indexing, which employs probabilistic data structures (Bloom filters) to trade accuracy for size and produce smaller, yet powerful, tree indexes, which we name Bloom filter trees (BF-Trees). BF-Trees exploit pre-existing data ordering or partitioning to offer competitive search performance. We demonstrate, both by an analytical study and by experimental results, that by using workload knowledge and reducing indexing accuracy up to some extent, we can save substantially on capacity when indexing on ordered or partitioned attributes. In particular, in experiments with a synthetic workload, approximate indexing offers 2.22x-48x smaller index footprint with competitive response times, and in experiments with TPCH and a monitoring real-life dataset from an energy company, it offers 1.6x-4x smaller index footprint with competitive search times as well.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tozun:2014:AAI, author = "Pinar T{\"o}z{\"u}n and Islam Atta and Anastasia Ailamaki and Andreas Moshovos", title = "{ADDICT}: advanced instruction chasing for transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1893--1904", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent studies highlight that traditional transaction processing systems utilize the micro-architectural features of modern processors very poorly. L1 instruction cache and long-latency data misses dominate execution time. As a result, more than half of the execution cycles are wasted on memory stalls. Previous works on reducing stall time aim at improving locality through either hardware or software techniques. 
However, exploiting hardware resources based on the hints given by the software-side has not been widely studied for data management systems. In this paper, we observe that, independently of their high-level functionality, transactions running in parallel on a multicore system execute actions chosen from a limited sub-set of predefined database operations. Therefore, we initially perform a memory characterization study of modern transaction processing systems using standardized benchmarks. The analysis demonstrates that same-type transactions exhibit at most 6\% overlap in their data footprints whereas there is up to 98\% overlap in instructions. Based on the findings, we design ADDICT, a transaction scheduling mechanism that aims at maximizing the instruction cache locality. ADDICT determines the most frequent actions of database operations, whose instruction footprint can fit in an L1 instruction cache, and assigns a core to execute each of these actions. Then, it schedules each action on its corresponding core. Our prototype implementation of ADDICT reduces L1 instruction misses by 85\% and the long latency data misses by 20\%. As a result, ADDICT leads up to a 50\% reduction in the total execution time for the evaluated workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alsubaiee:2014:ASO, author = "Sattam Alsubaiee and Yasser Altowim and Hotham Altwaijry and Alexander Behm and Vinayak Borkar and Yingyi Bu and Michael Carey and Inci Cetindil and Madhusudan Cheelangi and Khurram Faraaz and Eugenia Gabrielova and Raman Grover and Zachary Heilbron and Young-Seok Kim and Chen Li and Guangqiang Li and Ji Mahn Ok and Nicola Onose and Pouria Pirzadeh and Vassilis Tsotras and Rares Vernica and Jian Wen and Till Westmann", title = "{AsterixDB}: a scalable, open source {BDMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1905--1916", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "AsterixDB is a new, full-function BDMS (Big Data Management System) with a feature set that distinguishes it from other platforms in today's open source Big Data ecosystem. Its features make it well-suited to applications like web data warehousing, social data storage and analysis, and other use cases related to Big Data. AsterixDB has a flexible NoSQL style data model; a query language that supports a wide range of queries; a scalable runtime; partitioned, LSM-based data storage and indexing (including B$^+$-tree, R-tree, and text indexes); support for external as well as natively stored data; a rich set of built-in types; support for fuzzy, spatial, and temporal types and queries; a built-in notion of data feeds for ingestion of data; and transaction support akin to that of a NoSQL store. Development of AsterixDB began in 2009 and led to a mid-2013 initial open source release. This paper is the first complete description of the resulting open source AsterixDB system. 
Covered herein are the system's data model, its query language, and its software architecture. Also included are a summary of the current status of the project and a first glimpse into how AsterixDB performs when compared to alternative technologies, including a parallel relational DBMS, a popular NoSQL store, and a popular Hadoop-based SQL data analytics platform, for things that both technologies can do. Also included is a brief description of some initial trials that the system has undergone and the lessons learned (and plans laid) based on those early ``customer'' engagements.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2014:LLB, author = "Ning Xu and Lei Chen and Bin Cui", title = "{LogGP}: a log-based dynamic graph partitioning method", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1917--1928", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increasing availability and scale of graph data from Web 2.0, graph partitioning becomes one of efficient preprocessing techniques to balance the computing workload. Since the cost of partitioning the entire graph is strictly prohibitive, there are some recent tentative works towards streaming graph partitioning which can run faster, be easily paralleled, and be incrementally updated. Unfortunately, the experiments show that the running time of each partitioning is still unbalanced due to the variation of workload access patterns during the supersteps. In addition, the one-pass streaming partitioning result is not always satisfactory for the algorithms' local view of the graph. 
In this paper, we present LogGP, a log-based graph partitioning system that records, analyzes and reuses the historical statistical information to refine the partitioning result. LogGP can be used as a middle-ware and deployed to many state-of-the-art paralleled graph processing systems easily. LogGP utilizes the historical partitioning results to generate a hyper-graph and uses a novel hyper-graph streaming partitioning approach to generate a better initial streaming graph partitioning result. During the execution, the system uses running logs to optimize graph partitioning which prevents performance degradation. Moreover, LogGP can dynamically repartition the massive graphs in accordance with the structural changes. Extensive experiments conducted on a moderate size of computing cluster with real-world graph datasets demonstrate the superiority of our approach against the state-of-the-art solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papadakis:2014:SMB, author = "George Papadakis and George Papastefanatos and Georgia Koutrika", title = "Supervised meta-blocking", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1929--1940", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity Resolution matches mentions of the same entity. Being an expensive task for large data, its performance can be improved by blocking, i.e., grouping similar entities and comparing only entities in the same group. Blocking improves the run-time of Entity Resolution, but it still involves unnecessary comparisons that limit its performance. Meta-blocking is the process of restructuring a block collection in order to prune such comparisons. 
Existing unsupervised meta-blocking methods use simple pruning rules, which offer a rather coarse-grained filtering technique that can be conservative (i.e., keeping too many unnecessary comparisons) or aggressive (i.e., pruning good comparisons). In this work, we introduce supervised meta-blocking techniques that learn classification models for distinguishing promising comparisons. For this task, we propose a small set of generic features that combine a low extraction cost with high discriminatory power. We show that supervised meta-blocking can achieve high performance with small training sets that can be manually created. We analytically compare our supervised approaches with baseline and competitor methods over 10 large-scale datasets, both real and synthetic.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2014:GTK, author = "Min Xie and Laks V. S. Lakshmanan and Peter T. Wood", title = "Generating top-$k$ packages via preference elicitation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1941--1952", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There are several applications, such as play lists of songs or movies, and shopping carts, where users are interested in finding top-$k$ packages, consisting of sets of items. In response to this need, there has been a recent flurry of activity around extending classical recommender systems (RS), which are effective at recommending individual items, to recommend packages, or sets of items. 
The few recent proposals for package RS suffer from one of the following drawbacks: they either rely on hard constraints which may be difficult to be specified exactly by the user or on returning Pareto-optimal packages which are too numerous for the user to sift through. To overcome these limitations, we propose an alternative approach for finding personalized top-$k$ packages for users, by capturing users' preferences over packages using a linear utility function which the system learns. Instead of asking a user to specify this function explicitly, which is unrealistic, we explicitly model the uncertainty in the utility function and propose a preference elicitation-based framework for learning the utility function through feedback provided by the user. We propose several sampling-based methods which, given user feedback, can capture the updated utility function. We develop an efficient algorithm for generating top-$k$ packages using the learned utility function, where the rank ordering respects any of a variety of ranking semantics proposed in the literature. Through extensive experiments on both real and synthetic datasets, we demonstrate the efficiency and effectiveness of the proposed system for finding top-$k$ packages.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:FRQ, author = "Rui Li and Alex X. Liu and Ann L. 
Wang and Bezawada Bruhadeshwar", title = "Fast range query processing with strong privacy protection for cloud computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1953--1964", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Privacy has been the key road block to cloud computing as clouds may not be fully trusted. This paper concerns the problem of privacy preserving range query processing on clouds. Prior schemes are weak in privacy protection as they cannot achieve index indistinguishability, and therefore allow the cloud to statistically estimate the values of data and queries using domain knowledge and history query results. In this paper, we propose the first range query processing scheme that achieves index indistinguishability under the indistinguishability against chosen keyword attack (IND-CKA). Our key idea is to organize indexing elements in a complete binary tree called PBtree, which satisfies structure indistinguishability (i.e., two sets of data items have the same PBtree structure if and only if the two sets have the same number of data items) and node indistinguishability (i.e., the values of PBtree nodes are completely random and have no statistical meaning). We prove that our scheme is secure under the widely adopted IND-CKA security model. We propose two algorithms, namely PBtree traversal width minimization and PBtree traversal depth minimization, to improve query processing efficiency. We prove that the worst case complexity of our query processing algorithm using PBtree is $ O(| R | \log n) $, where $n$ is the total number of data items and $R$ is the set of data items in the query result. We implemented and evaluated our scheme on a real world data set with 5 million items.
For example, for a query whose results contain ten data items, it takes only 0.17 milliseconds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2014:FTP, author = "Yihan Gao and Aditya Parameswaran", title = "Finish them!: pricing algorithms for human computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1965--1976", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a batch of human computation tasks, a commonly ignored aspect is how the price (i.e., the reward paid to human workers) of these tasks must be set or varied in order to meet latency or cost constraints. Often, the price is set up-front and not modified, leading to either a much higher monetary cost than needed (if the price is set too high), or to a much larger latency than expected (if the price is set too low). Leveraging a pricing model from prior work, we develop algorithms to optimally set and then vary price over time in order to meet a (a) user-specified deadline while minimizing total monetary cost (b) user-specified monetary budget constraint while minimizing total elapsed time. We leverage techniques from decision theory (specifically, Markov Decision Processes) for both these problems, and demonstrate that our techniques lead to up to 30\% reduction in cost over schemes proposed in prior work. Furthermore, we develop techniques to speed-up the computation, enabling users to leverage the price setting algorithms on-the-fly.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Catasta:2014:TTC, author = "Michele Catasta and Alberto Tonon and Djellel Eddine Difallah and Gianluca Demartini and Karl Aberer and Philippe Cudr{\'e}-Mauroux", title = "{TransactiveDB}: tapping into collective human memories", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1977--1980", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database Management Systems (DBMSs) have been rapidly evolving in the recent years, exploring ways to store multi-structured data or to involve human processes during query execution. In this paper, we outline a future avenue for DBMSs supporting transactive memory queries that can only be answered by a collection of individuals connected through a given interaction graph. We present TransactiveDB and its ecosystem, which allow users to pose queries in order to reconstruct collective human memories. We describe a set of new transactive operators including TUnion, TFill, TJoin, and TProjection. We also describe how TransactiveDB leverages transactive operators---by mixing query execution, social network analysis and human computation---in order to effectively and efficiently tap into the memories of all targeted users.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2014:BBC, author = "Da Yan and James Cheng and Yi Lu and Wilfred Ng", title = "{Blogel}: a block-centric framework for distributed computation on real-world graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1981--1992", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rapid growth in the volume of many real-world graphs (e.g., social networks, web graphs, and spatial networks) has led to the development of various vertex-centric distributed graph computing systems in recent years. However, real-world graphs from different domains have very different characteristics, which often create bottlenecks in vertex-centric parallel graph computation. We identify three such important characteristics from a wide spectrum of real-world graphs, namely (1) skewed degree distribution, (2) large diameter, and (3) (relatively) high density. Among them, only (1) has been studied by existing systems, but many real-world power-law graphs also exhibit the characteristics of (2) and (3). In this paper, we propose a block-centric framework, called Blogel, which naturally handles all the three adverse graph characteristics. Blogel programmers may think like a block and develop efficient algorithms for various graph problems. We propose parallel algorithms to partition an arbitrary graph into blocks efficiently, and block-centric programs are then run over these blocks. Our experiments on large real-world graphs verified that Blogel is able to achieve orders of magnitude performance improvements over the state-of-the-art distributed graph computing systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liagouris:2014:EII, author = "John Liagouris and Manolis Terrovitis", title = "Efficient identification of implicit facts in incomplete {OWL2-EL} knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "1993--2004", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Integrating incomplete and possibly inconsistent data from various sources is a challenge that arises in several application areas, especially in the management of scientific data. A rising trend for data integration is to model the data as axioms in the Web Ontology Language (OWL) and use inference rules to identify new facts. Although there are several approaches that employ OWL for data integration, there is little work on scalable algorithms able to handle large datasets that do not fit in main memory. The main contribution of this paper is an algorithm that allows the effective use of OWL for integrating data in an environment with limited memory. The core idea is to exhaustively apply a set of complex inference rules on large disk-resident datasets. To the best of our knowledge, this is the first work that proposes an I/O-aware algorithm for tackling with such an expressive subset of OWL like the one we address here. Previous approaches considered either simpler models (e.g. RDFS) or main-memory algorithms. In the paper we detail the proposed algorithm, prove its correctness, and experimentally evaluate it on real and synthetic data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2014:WCA, author = "Chen Jason Zhang and Yongxin Tong and Lei Chen", title = "Where to: crowd-aided path selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "2005--2016", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the widespread use of geo-positioning services (GPS), GPS-based navigation systems have become ever more of an integral part of our daily lives. GPS-based navigation systems usually suggest multiple paths for any given pair of source and target, leaving users perplexed when trying to select the best one among them, namely the problem of best path selection. Too many suggested paths may jeopardize the usability of the recommendation data, and decrease user satisfaction. Although existing studies have already partially relieved this problem through integrating historical traffic logs or updating traffic conditions periodically, their solutions neglect the potential contribution of human experience. In this paper, we resort to crowdsourcing to ease the pain of the best path selection. The first step of appropriately using the crowd is to ask proper questions. For the best path selection problem, simple questions (e.g. binary voting) over complete paths cannot be directly applied to road networks due to their being too complex for crowd workers. Thus, this paper makes the first contribution by designing two types of questions, namely Routing Query (RQ) and Binary Routing Query (BRQ), to ask the crowd to decide which direction to take at each road intersection.
Furthermore, we propose a series of efficient algorithms to dynamically manage the questions in order to reduce the selection hardness within a limited budget. Finally, we compare the proposed methods against two baselines, and the effectiveness and efficiency of our proposals are verified by the results from simulations and experiments on a real-world crowdsourcing platform.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2014:LSR, author = "Yan Huang and Favyen Bastani and Ruoming Jin and Xiaoyang Sean Wang", title = "Large scale real-time ridesharing with service guarantee on road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "7", number = "14", pages = "2017--2028", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 4 17:20:43 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Urban traffic gridlock is a familiar scene. At the same time, the mean occupancy rate of personal vehicle trips in the United States is only 1.6 persons per vehicle mile. Ridesharing has the potential to solve many environmental, congestion, pollution, and energy problems. In this paper, we introduce the problem of large scale real-time ridesharing with service guarantee on road networks. Trip requests are dynamically matched to vehicles while trip waiting and service time constraints are satisfied. We first propose two scheduling algorithms: a branch-and-bound algorithm and an integer programming algorithm. However, these algorithms do not adapt well to the dynamic nature of the ridesharing problem. Thus, we propose kinetic tree algorithms which are better suited to efficient scheduling of dynamic requests and adjust routes on-the-fly. We perform experiments on a large Shanghai taxi dataset. 
Results show that the kinetic tree algorithms outperform other algorithms significantly.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2014:SSA, author = "Yifang Sun and Wei Wang and Jianbin Qin and Ying Zhang and Xuemin Lin", title = "{SRS}: solving $c$-approximate nearest neighbor queries in high dimensional {Euclidean} space with a tiny index", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "1--12", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearest neighbor searches in high-dimensional space have many important applications in domains such as data mining, and multimedia databases. The problem is challenging due to the phenomenon called ``curse of dimensionality''. An alternative solution is to consider algorithms that return a $c$-approximate nearest neighbor ($c$-ANN) with guaranteed probabilities. Locality Sensitive Hashing (LSH) is among the most widely adopted methods, and it achieves high efficiency both in theory and practice. However, it is known to require an extremely high amount of space for indexing, hence limiting its scalability. In this paper, we propose several surprisingly simple methods to answer $c$-ANN queries with theoretical guarantees requiring only a single tiny index. Our methods are highly flexible and support a variety of functionalities, such as finding the exact nearest neighbor with any given probability. In the experiment, our methods demonstrate superior performance against the state-of-the-art LSH-based methods, and scale up well to 1 billion high-dimensional points on a single commodity PC.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dallachiesa:2014:TKN, author = "Michele Dallachiesa and Themis Palpanas and Ihab F. Ilyas", title = "Top-$k$ nearest neighbor search in uncertain data series", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "13--24", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many real applications consume data that is intrinsically uncertain, noisy and error-prone. In this study, we investigate the problem of finding the top-$k$ nearest neighbors in uncertain data series, which occur in several different domains. We formalize the top-$k$ nearest neighbor problem for uncertain data series, and describe a model for uncertain data series that captures both uncertainty and correlation. This distinguishes our approach from prior work that compromises the accuracy of the model by assuming independence of the value distribution at neighboring time-stamps. We introduce the Holistic-P$k$NN algorithm, which uses novel metric bounds for uncertain series and an efficient refinement strategy to reduce the overall number of required probability estimates. We evaluate our proposal under a variety of settings using a combination of synthetic and 45 real datasets from diverse domains. The results demonstrate the significant advantages of the proposed approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:RBP, author = "Jiexing Li and Jeffrey Naughton and Rimma V. 
Nehme", title = "Resource bricolage for parallel database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "25--36", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Running parallel database systems in an environment with heterogeneous resources has become increasingly common, due to cluster evolution and increasing interest in moving applications into public clouds. For database systems running in a heterogeneous cluster, the default uniform data partitioning strategy may overload some of the slow machines while at the same time it may under-utilize the more powerful machines. Since the processing time of a parallel query is determined by the slowest machine, such an allocation strategy may result in a significant query performance degradation. We take a first step to address this problem by introducing a technique we call resource bricolage that improves database performance in heterogeneous environments. Our approach quantifies the performance differences among machines with various resources as they process workloads with diverse resource requirements. We formalize the problem of minimizing workload execution time and view it as an optimization problem, and then we employ linear programming to obtain a recommended data partitioning scheme. We verify the effectiveness of our technique with an extensive experimental study on a commercial database system.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Graefe:2014:MPB, author = "Goetz Graefe and Haris Volos and Hideaki Kimura and Harumi Kuno and Joseph Tucek and Mark Lillibridge and Alistair Veitch", title = "In-memory performance for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "37--48", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "When a working set fits into memory, the overhead imposed by the buffer pool renders traditional databases non-competitive with in-memory designs that sacrifice the benefits of a buffer pool. However, despite the large memory available with modern hardware, data skew, shifting workloads, and complex mixed workloads make it difficult to guarantee that a working set will fit in memory. Hence, some recent work has focused on enabling in-memory databases to protect performance when the working data set almost fits in memory. Contrary to those prior efforts, we enable buffer pool designs to match in-memory performance while supporting the ``big data'' workloads that continue to require secondary storage, thus providing the best of both worlds. We introduce here a novel buffer pool design that adapts pointer swizzling for references between system objects (as opposed to application objects), and uses it to practically eliminate buffer pool overheads for memory-resident data. Our implementation and experimental evaluation demonstrate that we achieve graceful performance degradation when the working set grows to exceed the buffer pool size, and graceful improvement when the working set shrinks towards and below the memory and buffer pool sizes.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Long:2014:TSM, author = "Cheng Long and Raymond Chi-Wing Wong and H. V. Jagadish", title = "Trajectory simplification: on minimizing the direction-based error", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "49--60", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trajectory data is central to many applications with moving objects. Raw trajectory data is usually very large, and so is simplified before it is stored and processed. Many trajectory simplification notions have been proposed, and among them, the direction-preserving trajectory simplification (DPTS) which aims at protecting the direction information has been shown to perform quite well. However, existing studies on DPTS require users to specify an error tolerance which users might not know how to set properly in some cases (e.g., the error tolerance could only be known at some future time and simply setting one error tolerance does not meet the needs since the simplified trajectories would usually be used in many different applications which accept different error tolerances). In these cases, a better solution is to minimize the error while achieving a pre-defined simplification size. For this purpose, in this paper, we define a problem called Min-Error and develop two exact algorithms and one 2-factor approximate algorithm for the problem. Extensive experiments on real datasets verified our algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{ElGebaly:2014:IIE, author = "Kareem {El Gebaly} and Parag Agrawal and Lukasz Golab and Flip Korn and Divesh Srivastava", title = "Interpretable and informative explanations of outcomes", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "61--72", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we solve the following data summarization problem: given a multi-dimensional data set augmented with a binary attribute, how can we construct an interpretable and informative summary of the factors affecting the binary attribute in terms of the combinations of values of the dimension attributes? We refer to such summaries as explanation tables. We show the hardness of constructing optimally-informative explanation tables from data, and we propose effective and efficient heuristics. The proposed heuristics are based on sampling and include optimizations related to computing the information content of a summary from a sample of the data. Using real data sets, we demonstrate the advantages of explanation tables compared to related approaches that can be adapted to solve our problem, and we show significant performance benefits of our optimizations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:CIN, author = "Fei Li and H. V. 
Jagadish", title = "Constructing an interactive natural language interface for relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "73--84", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Natural language has been the holy grail of query interface designers, but has generally been considered too hard to work with, except in limited specific circumstances. In this paper, we describe the architecture of an interactive natural language query interface for relational databases. Through a carefully limited interaction with the user, we are able to correctly interpret complex natural language queries, in a generic manner across a range of domains. By these means, a logically complex English language sentence is correctly translated into a SQL query, which may include aggregation, nesting, and various types of joins, among other things, and can be evaluated against an RDBMS. We have constructed a system, NaLIR (Natural Language Interface for Relational databases), embodying these ideas. Our experimental assessment, through user studies, demonstrates that NaLIR is good enough to be usable in practice: even naive users are able to specify quite complex ad-hoc queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2014:LGD, author = "Yuanyuan Zhu and Jeffrey Xu Yu and Lu Qin", title = "Leveraging graph dimensions in online graph search", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "85--96", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs have been widely used due to its expressive power to model complicated relationships. However, given a graph database $ D_g = \{ g_1, g_2, \ldots, g_n \} $, it is challenging to process graph queries since a basic graph query usually involves costly graph operations such as maximum common subgraph and graph edit distance computation, which are NP-hard. In this paper, we study a novel DS-preserved mapping which maps graphs in a graph database $ D_g $ onto a multidimensional space $ M_g $ under a structural dimension $M$ using a mapping function $ \phi () $. The DS-preserved mapping preserves two things: distance and structure. By the distance-preserving, it means that any two graphs $ g_i$ and $ g_j$ in $ D_g$ must map to two data objects $ \phi (g_i)$ and $ \phi (g_j)$ in $ M_g$, such that the distance, $ d(\phi (g_i), \phi (g_j))$, between $ \phi (g_i)$ and $ \phi (g_j)$ in $ M_g$ approximates the graph dissimilarity $ \delta (g_i, g_j)$ in $ D_g$. By the structure-preserving, it further means that for a given unseen query graph $q$, the distance between $q$ and any graph $ g_i$ in $ D_g$ needs to be preserved such that $ \delta (q, g_i) \approx d(\phi (q), \phi (g_i))$. We discuss the rationality of using graph dimension $M$ for online graph processing, and show how to identify a small set of subgraphs to form $M$ efficiently.
We propose an iterative algorithm DSPM to compute the graph dimension, and discuss its optimization techniques. We also give an approximate algorithm DSPMap in order to handle a large graph database. We conduct extensive performance studies on both real and synthetic datasets to evaluate the top-$k$ similarity query which is to find top-$k$ similar graphs from $ D_g$ for a query graph, and show the effectiveness and efficiency of our approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sidlauskas:2014:SJM, author = "Darius Sidlauskas and Christian S. Jensen", title = "Spatial joins in main memory: implementation matters!", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "1", pages = "97--100", month = sep, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:33 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A recent PVLDB paper reports on experimental analyses of ten spatial join techniques in main memory. We build on this comprehensive study to raise awareness of the fact that empirical running time performance findings in main-memory settings are results of not only the algorithms and data structures employed, but also their implementation, which complicates the interpretation of the results. In particular, we re-implement the worst performing technique without changing the underlying high-level algorithm, and we then offer evidence that the resulting re-implementation is capable of outperforming all the other techniques. 
This study demonstrates that in main memory, where no time-consuming I/O can mask variations in implementation, implementation details are very important; and it offers a concrete illustration of how it is difficult to make conclusions from empirical running time performance findings in main-memory settings about data structures and algorithms studied.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2014:SES, author = "Xiaoyang Wang and Ying Zhang and Wenjie Zhang and Xuemin Lin and Wei Wang", title = "Selectivity estimation on streaming spatio-textual data using local correlations", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "101--112", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we investigate the selectivity estimation problem for streaming spatio-textual data, which arises in many social network and geo-location applications. Specifically, given a set of continuously and rapidly arriving spatio-textual objects, each of which is described by a geo-location and a short text, we aim to accurately estimate the cardinality of a spatial keyword query on objects seen so far, where a spatial keyword query consists of a search region and a set of query keywords. To the best of our knowledge, this is the first work to address this important problem. We first extend two existing techniques to solve this problem, and show their limitations. 
Inspired by two key observations on the ``locality'' of the correlations among query keywords, we propose a local correlation based method by utilizing an augmented adaptive space partition tree ($ A^2 $SP-tree for short) to approximately learn a local Bayesian network on-the-fly for a given query and estimate its selectivity. A novel local boosting approach is presented to further enhance the learning accuracy of local Bayesian networks. Our comprehensive experiments on real-life datasets demonstrate the superior performance of the local correlation based algorithm in terms of estimation accuracy compared to other competitors.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:PMK, author = "Chuanwen Li and Yu Gu and Jianzhong Qi and Ge Yu and Rui Zhang and Wang Yi", title = "Processing moving $k${NN} queries using influential neighbor sets", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "113--124", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The moving $k$ nearest neighbor query, which computes one's $k$ nearest neighbor set and maintains it while at move, is gaining importance due to the prevalent use of smart mobile devices such as smart phones. Safe region is a popular technique in processing the moving $k$ nearest neighbor query. It is a region where the movement of the query object does not cause the current $k$ nearest neighbor set to change. Processing a moving $k$ nearest neighbor query is a continuing process of checking the validity of the safe region and recomputing it if invalidated. The size of the safe region largely decides the frequency of safe region recomputation and hence query processing efficiency.
Existing moving $k$ nearest neighbor algorithms lack efficiency due to either computing small safe regions and have to recompute frequently or computing large safe regions (i.e., an order-$k$ Voronoi cell) with a high cost. In this paper, we take a third approach. Instead of safe regions, we use a small set of safe guarding objects. We prove that, as long as the current $k$ nearest neighbors are closer to the query object than the safe guarding objects, the current $k$ nearest neighbors stay valid and no recomputation is required. This way, we avoid the high cost of safe region recomputation. We also prove that, the region defined by the safe guarding objects is the largest possible safe region. This means that the recomputation frequency of our method is also minimized. We conduct extensive experiments comparing our method with the state-of-the-art method on both real and synthetic data sets. The results confirm the superiority of our method.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mozafari:2014:SCS, author = "Barzan Mozafari and Purna Sarkar and Michael Franklin and Michael Jordan and Samuel Madden", title = "Scaling up crowd-sourcing to very large datasets: a case for active learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "125--136", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowd-sourcing has become a popular means of acquiring labeled data for many tasks where humans are more accurate than computers, such as image tagging, entity resolution, and sentiment analysis.
However, due to the time and cost of human labor, solutions that rely solely on crowd-sourcing are often limited to small datasets (i.e., a few thousand items). This paper proposes algorithms for integrating machine learning into crowd-sourced databases in order to combine the accuracy of human labeling with the speed and cost-effectiveness of machine learning classifiers. By using active learning as our optimization strategy for labeling tasks in crowd-sourced databases, we can minimize the number of questions asked to the crowd, allowing crowd-sourced applications to scale (i.e., label much larger datasets at lower costs). Designing active learning algorithms for a crowd-sourced database poses many practical challenges: such algorithms need to be generic, scalable, and easy to use, even for practitioners who are not machine learning experts. We draw on the theory of nonparametric bootstrap to design, to the best of our knowledge, the first active learning algorithms that meet all these requirements. Our results, on 3 real-world datasets collected with Amazon's Mechanical Turk, and on 15 UCI datasets, show that our methods on average ask 1--2 orders of magnitude fewer questions than the baseline, and $ 4.5$--$ 44 \times $ fewer than existing active learning algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2014:CCO, author = "Dingyu Yang and Dongxiang Zhang and Kian-Lee Tan and Jian Cao and Fr{\'e}d{\'e}ric {Le Mou{\"e}l}", title = "{CANDS}: continuous optimal navigation via distributed stream processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "137--148", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest path query over a dynamic road network is a prominent problem for the optimization of real-time traffic systems. Existing solutions rely either on a centralized index system with tremendous pre-computation overhead, or on a distributed graph processing system such as Pregel that requires much synchronization effort. However, the performance of these systems degenerates with frequent route path updates caused by continuous traffic condition change. In this paper, we build CANDS, a distributed stream processing platform for continuous optimal shortest path queries. It provides an asynchronous solution to answering a large quantity of shortest path queries. It is able to efficiently detect affected paths and adjust their paths in the face of traffic updates. Moreover, the affected paths can be quickly updated to the optimal solutions throughout the whole navigation process. Experimental results demonstrate that the performance for answering shortest path queries by CANDS is two orders of magnitude better than that of GPS, an open-source implementation of Pregel. In addition, CANDS provides fast response to traffic updates to guarantee the optimality of answering shortest path queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Begum:2014:RTS, author = "Nurjahan Begum and Eamonn Keogh", title = "Rare time series motif discovery from unbounded streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "149--160", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The detection of time series motifs, which are approximately repeated subsequences in time series streams, has been shown to have great utility as a subroutine in many higher-level data mining algorithms. However, this detection becomes much harder in cases where the motifs of interest are vanishingly rare or when faced with a never-ending stream of data. In this work we investigate algorithms to find such rare motifs. We demonstrate that under reasonable assumptions we must abandon any hope of an exact solution to the motif problem as it is normally defined; however, we introduce algorithms that allow us to solve the underlying problem with high probability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bu:2014:PBG, author = "Yingyi Bu and Vinayak Borkar and Jianfeng Jia and Michael J. 
Carey and Tyson Condie", title = "{Pregelix}: {Big(ger)} graph analytics on a dataflow engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "161--172", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is a growing need for distributed graph processing systems that are capable of gracefully scaling to very large graph datasets. Unfortunately, this challenge has not been easily met due to the intense memory pressure imposed by process-centric, message passing designs that many graph processing systems follow. Pregelix is a new open source distributed graph processing system that is based on an iterative dataflow design that is better tuned to handle both in-memory and out-of-core workloads. As such, Pregelix offers improved performance characteristics and scaling properties over current open source systems (e.g., we have seen up to $ 15 \times $ speedup compared to Apache Giraph and up to $ 35 \times $ speedup compared to distributed GraphLab), and more effective use of available machine resources to support Big(ger) Graph Analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sridharan:2014:PRC, author = "Shriram Sridharan and Jignesh M. 
Patel", title = "Profiling {R} on a contemporary processor", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "2", pages = "173--184", month = oct, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "R is a popular data analysis language, but there is scant experimental data characterizing the run-time profile of R programs. This paper addresses this limitation by systematically cataloging where time is spent when running R programs. Our evaluation using four different workloads shows that when analyzing large datasets, R programs (a) spend more than 85\% of their time in processor stalls, which leads to slower execution times, (b) trigger the garbage collector frequently, which leads to higher memory stalls, and (c) create a large number of unnecessary temporary objects that causes R to swap to disk quickly even for datasets that are far smaller than the available main memory. Addressing these issues should allow R programs to run faster than they do today, and allow R to be used for analyzing even larger datasets. As outlined in this paper, the results presented in this paper motivate a number of future research investigations in the database, architecture, and programming language communities. All data and code that is used in this paper (which includes the R programs, and changes to the R source code for instrumentation) can be found at: {\tt http://quickstep.cs.wisc.edu/dissecting-R/}.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bailis:2014:CAD, author = "Peter Bailis and Alan Fekete and Michael J. Franklin and Ali Ghodsi and Joseph M. 
Hellerstein and Ion Stoica", title = "Coordination avoidance in database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "3", pages = "185--196", month = nov, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Minimizing coordination, or blocking communication between concurrently executing operations, is key to maximizing scalability, availability, and high performance in database systems. However, uninhibited coordination-free execution can compromise application correctness, or consistency. When is coordination necessary for correctness? The classic use of serializable transactions is sufficient to maintain correctness but is not necessary for all applications, sacrificing potential scalability. In this paper, we develop a formal framework, invariant confluence, that determines whether an application requires coordination for correct execution. By operating on application-level invariants over database states (e.g., integrity constraints), invariant confluence analysis provides a necessary and sufficient condition for safe, coordination-free execution. When programmers specify their application invariants, this analysis allows databases to coordinate only when anomalies that might violate invariants are possible. We analyze the invariant confluence of common invariants and operations from real-world database systems (i.e., integrity constraints) and applications and show that many are invariant confluent and therefore achievable without coordination. We apply these results to a proof-of-concept coordination-avoiding database prototype and demonstrate sizable performance gains compared to serializable execution, notably a 25-fold improvement over prior TPC-C New-Order performance on a 200 server cluster.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2014:QSI, author = "Qiang Zeng and Jignesh M. Patel and David Page", title = "{QuickFOIL}: scalable inductive logic programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "3", pages = "197--208", month = nov, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Inductive Logic Programming (ILP) is a classic machine learning technique that learns first-order rules from relational-structured data. However, to-date most ILP systems can only be applied to small datasets (tens of thousands of examples). A long-standing challenge in the field is to scale ILP methods to larger data sets. This paper presents a method called QuickFOIL that addresses this limitation. QuickFOIL employs a new scoring function and a novel pruning strategy that enables the algorithm to find high-quality rules. QuickFOIL can also be implemented as an in-RDBMS algorithm. Such an implementation presents a host of query processing and optimization challenges that we address in this paper. Our empirical evaluation shows that QuickFOIL can scale to large datasets consisting of hundreds of millions tuples, and is often more than order of magnitude more efficient than other existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2014:SAE, author = "Xiangyao Yu and George Bezerra and Andrew Pavlo and Srinivas Devadas and Michael Stonebraker", title = "Staring into the abyss: an evaluation of concurrency control with one thousand cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "3", pages = "209--220", month = nov, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computer architectures are moving towards an era dominated by many-core machines with dozens or even hundreds of cores on a single chip. This unprecedented level of on-chip parallelism introduces a new dimension to scalability that current database management systems (DBMSs) were not designed for. In particular, as the number of cores increases, the problem of concurrency control becomes extremely challenging. With hundreds of threads running in parallel, the complexity of coordinating competing accesses to data will likely diminish the gains from increased core counts. To better understand just how unprepared current DBMSs are for future CPU architectures, we performed an evaluation of concurrency control for on-line transaction processing (OLTP) workloads on many-core chips. We implemented seven concurrency control algorithms on a main-memory DBMS and using computer simulations scaled our system to 1024 cores. Our analysis shows that all algorithms fail to scale to this magnitude but for different reasons. In each case, we identify fundamental bottlenecks that are independent of the particular database implementation and argue that even state-of-the-art DBMSs suffer from these limitations. 
We conclude that rather than pursuing incremental solutions, many-core chips may require a completely redesigned DBMS architecture that is built from ground up and is tightly coupled with the hardware.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Trummer:2014:MOP, author = "Immanuel Trummer and Christoph Koch", title = "Multi-objective parametric query optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "3", pages = "221--232", month = nov, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Classical query optimization compares query plans according to one cost metric and associates each plan with a constant cost value. In this paper, we introduce the Multi-Objective Parametric Query Optimization (MPQ) problem where query plans are compared according to multiple cost metrics and the cost of a given plan according to a given metric is modeled as a function that depends on multiple parameters. The cost metrics may for instance include execution time or monetary fees; a parameter may represent the selectivity of a query predicate that is unspecified at optimization time. MPQ generalizes parametric query optimization (which allows multiple parameters but only one cost metric) and multi-objective query optimization (which allows multiple cost metrics but no parameters). We formally analyze the novel MPQ problem and show why existing algorithms are inapplicable. We present a generic algorithm for MPQ and a specialized version for MPQ with piecewise-linear plan cost functions. 
We prove that both algorithms find all relevant query plans and experimentally evaluate the performance of our second algorithm in a Cloud computing scenario.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Giceva:2014:DQP,
  author =       "Jana Giceva and Gustavo Alonso and Timothy Roscoe and Tim Harris",
  title =        "Deployment of query plans on multicores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "233--244",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Efficient resource scheduling of multithreaded software on multicore hardware is difficult given the many parameters involved and the hardware heterogeneity of existing systems. In this paper we explore the efficient deployment of query plans over a multicore machine. We focus on shared query systems, and implement the proposed ideas using SharedDB. The goal of the paper is to explore how to deliver maximum performance and predictability, while minimizing resource utilization when deploying query plans on multicore machines. We propose to use resource activity vectors to characterize the behavior of individual database operators. We then present a novel deployment algorithm which uses these vectors together with dataflow information from the query plan to optimally assign relational operators to physical cores. Experiments demonstrate that this approach significantly reduces resource requirements while preserving performance and is robust across different server architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Taft:2014:SFG,
  author =       "Rebecca Taft and Essam Mansour and Marco Serafini and Jennie Duggan and Aaron J. Elmore and Ashraf Aboulnaga and Andrew Pavlo and Michael Stonebraker",
  title =        "{E-Store}: fine-grained elastic partitioning for distributed transaction processing systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "245--256",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "On-line transaction processing (OLTP) database management systems (DBMSs) often serve time-varying workloads due to daily, weekly or seasonal fluctuations in demand, or because of rapid growth in demand due to a company's business success. In addition, many OLTP workloads are heavily skewed to ``hot'' tuples or ranges of tuples. For example, the majority of NYSE volume involves only 40 stocks. To deal with such fluctuations, an OLTP DBMS needs to be elastic; that is, it must be able to expand and contract resources in response to load fluctuations and dynamically balance load as hot tuples vary over time. This paper presents E-Store, an elastic partitioning framework for distributed OLTP DBMSs. It automatically scales resources in response to demand spikes, periodic events, and gradual changes in an application's workload. E-Store addresses localized bottlenecks through a two-tier data placement strategy: cold data is distributed in large chunks, while smaller ranges of hot tuples are assigned explicitly to individual nodes. This is in contrast to traditional single-tier hash and range partitioning strategies. Our experimental evaluation of E-Store shows the viability of our approach and its efficacy under variations in load across a cluster of machines.
Compared to single-tier approaches, E-Store improves throughput by up to 130\% while reducing latency by 80\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Thirumuruganathan:2014:BIM,
  author =       "Saravanan Thirumuruganathan and Habibur Rahman and Sofiane Abbar and Gautam Das",
  title =        "Beyond itemsets: mining frequent featuresets over structured items",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "257--268",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We assume a dataset of transactions generated by a set of users over structured items where each item could be described through a set of features. In this paper, we are interested in identifying the frequent featuresets (set of features) by mining item transactions. For example, in a news website, items correspond to news articles, the features are the named-entities/topics in the articles and an item transaction would be the set of news articles read by a user within the same session. We show that mining frequent featuresets over structured item transactions is a novel problem and show that straightforward extensions of existing frequent itemset mining techniques provide unsatisfactory results. This is due to the fact that while users are drawn to each item in the transaction due to a subset of its features, the transaction by itself does not provide any information about such underlying preferred features of users. In order to overcome this hurdle, we propose a featureset uncertainty model where each item transaction could have been generated by various featuresets with different probabilities.
We describe a novel approach to transform item transactions into uncertain transaction over featuresets and estimate their probabilities using constrained least squares based approach. We propose diverse algorithms to mine frequent featuresets. Our experimental evaluation provides a comparative analysis of the different approaches proposed.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2014:ICD,
  author =       "Jun Zhang and Chaokun Wang and Jianmin Wang and Jeffrey Xu Yu",
  title =        "Inferring continuous dynamic social influence and personal preference for temporal behavior prediction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "269--280",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "It is always attractive and challenging to explore the intricate behavior data and uncover people's motivations, preference and habits, which can greatly benefit many tasks including link prediction, item recommendation, etc. Traditional work usually studies people's behaviors without time information in a static or discrete manner, assuming the underlying factors stay invariant in a long period. However, we believe people's behaviors are dynamic, and the contributing factors including the social influence and personal preference for behaviors are varying continuously over time. Such continuous dynamics convey important knowledge about people's behavior patterns; ignoring them would lead to inaccurate models. In this work, we address the continuous dynamic modeling of temporal behaviors.
To model the fully continuous temporal dynamics of behaviors and the underlying factors, we propose the DP-Space, a dynamic preference probability space, which can capture their smooth variation in various shapes over time with flexible basis functions. Upon that we propose a generative dynamic behavior model, ConTyor, which considers the temporal item-adoption behaviors as joint effect of dynamic social influence and varying personal preference over continuous time. We also develop effective inference methods for ConTyor and present its applications. We conduct a comprehensive experimental study using real-world datasets to evaluate the effectiveness of our model and the temporal modeling. Results verify that ConTyor outperforms existing state-of-the-art static and temporal models in behavior predictions. Moreover, in our detailed study on temporal modeling, we show that temporal modeling is superior to static approaches and modeling over continuous time is further better than that over discrete time. We also demonstrate that the ancient behavior data can still become important and beneficial if modeled well.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lu:2014:LSD,
  author =       "Yi Lu and James Cheng and Da Yan and Huanhuan Wu",
  title =        "Large-scale distributed graph computing systems: an experimental evaluation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "281--292",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "With the prevalence of graph data in real-world applications (e.g., social networks, mobile phone networks, web graphs, etc.)
and their ever-increasing size, many distributed graph computing systems have been developed in recent years to process and analyze massive graphs. Most of these systems adopt Pregel's vertex-centric computing model, while various techniques have been proposed to address the limitations in the Pregel framework. However, there is a lack of comprehensive comparative analysis to evaluate the performance of various systems and their techniques, making it difficult for users to choose the best system for their applications. We conduct extensive experiments to evaluate the performance of existing systems on graphs with different characteristics and on algorithms with different design logic. We also study the effectiveness of various techniques adopted in existing systems, and the scalability of the systems. The results of our study reveal the strengths and limitations of existing systems, and provide valuable insights for users, researchers and system developers.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Inoue:2014:FSI,
  author =       "Hiroshi Inoue and Moriyoshi Ohara and Kenjiro Taura",
  title =        "Faster set intersection with {SIMD} instructions by reducing branch mispredictions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "293--304",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Set intersection is one of the most important operations for many applications such as Web search engines or database management systems. This paper describes our new algorithm to efficiently find set intersections with sorted arrays on modern processors with SIMD instructions and high branch misprediction penalties.
Our algorithm efficiently exploits SIMD instructions and can drastically reduce branch mispredictions. Our algorithm extends a merge-based algorithm by reading multiple elements, instead of just one element, from each of two input arrays and compares all of the pairs of elements from the two arrays to find the elements with the same values. The key insight for our improvement is that we can reduce the number of costly hard-to-predict conditional branches by advancing a pointer by more than one element at a time. Although this algorithm increases the total number of comparisons, we can execute these comparisons more efficiently using the SIMD instructions and gain the benefits of the reduced branch misprediction overhead. Our algorithm is suitable to replace existing standard library functions, such as {\tt std::set\_intersection} in C++, thus accelerating many applications, because the algorithm is simple and requires no preprocessing to generate additional data structures. We implemented our algorithm on Xeon and POWER7+. The experimental results show our algorithm outperforms the {\tt std::set\_intersection} implementation delivered with gcc by up to 5.2x using SIMD instructions and by up to 2.1x even without using SIMD instructions for 32-bit and 64-bit integer datasets. Our SIMD algorithm also outperformed an existing algorithm that can leverage SIMD instructions.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{El-Kishky:2014:STP,
  author =       "Ahmed El-Kishky and Yanglei Song and Chi Wang and Clare R.
Voss and Jiawei Han",
  title =        "Scalable topical phrase mining from text corpora",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "305--316",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "While most topic modeling algorithms model text corpora with unigrams, human interpretation often relies on inherent grouping of terms into phrases. As such, we consider the problem of discovering topical phrases of mixed lengths. Existing work either performs post processing to the results of unigram-based topic models, or utilizes complex $n$-gram-discovery topic models. These methods generally produce low-quality topical phrases or suffer from poor scalability on even moderately-sized datasets. We propose a different approach that is both computationally efficient and effective. Our solution combines a novel phrase mining framework to segment a document into single and multi-word phrases, and a new topic model that operates on the induced document partition. Our approach discovers high quality topical phrases with negligible extra cost to the bag-of-words topic model in a variety of datasets including research publication titles, abstracts, reviews, and news articles.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tao:2014:ETK,
  author =       "Wenbo Tao and Minghe Yu and Guoliang Li",
  title =        "Efficient top-$k$ {SimRank}-based similarity join",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "3",
  pages =        "317--328",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:34 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "SimRank is a popular and widely-adopted similarity measure to evaluate the similarity between nodes in a graph. It is time and space consuming to compute the SimRank similarities for all pairs of nodes, especially for large graphs. In real-world applications, users are only interested in the most similar pairs. To address this problem, in this paper we study the top-$k$ SimRank-based similarity join problem, which finds $k$ most similar pairs of nodes with the largest SimRank similarities among all possible pairs. To the best of our knowledge, this is the first attempt to address this problem. We encode each node as a vector by summarizing its neighbors and transform the calculation of the SimRank similarity between two nodes to computing the dot product between the corresponding vectors. We devise an efficient two-step framework to compute top-$k$ similar pairs using the vectors. For large graphs, exact algorithms cannot meet the high-performance requirement, and we also devise an approximate algorithm which can efficiently identify top-$k$ similar pairs under user-specified accuracy requirement. Experiments on both real and synthetic datasets show our method achieves high performance and good scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{He:2014:CQC,
  author =       "Jiong He and Shuhao Zhang and Bingsheng He",
  title =        "In-cache query co-processing on coupled {CPU--GPU} architectures",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "329--340",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Recently, there have been some emerging processor designs that the CPU and the GPU (Graphics Processing Unit) are integrated in a single chip and share Last Level Cache (LLC). However, the main memory bandwidth of such coupled CPU-GPU architectures can be much lower than that of a discrete GPU. As a result, current GPU query co-processing paradigms can severely suffer from memory stalls. In this paper, we propose a novel in-cache query co-processing paradigm for main memory On-Line Analytical Processing (OLAP) databases on coupled CPU-GPU architectures. Specifically, we adapt CPU-assisted prefetching to minimize cache misses in GPU query co-processing and CPU-assisted decompression to improve query execution performance. Furthermore, we develop a cost model guided adaptation mechanism for distributing the workload of prefetching, decompression, and query execution between CPU and GPU. We implement a system prototype and evaluate it on two recent AMD APUs A8 and A10. The experimental results show that (1) in-cache query co-processing can effectively improve the performance of the state-of-the-art GPU co-processing paradigm by up to 30\% and 33\% on A8 and A10, respectively, and (2) our workload distribution adaption mechanism can significantly improve the query performance by up to 36\% and 40\% on A8 and A10, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fujiwara:2014:SMR,
  author =       "Yasuhiro Fujiwara and Go Irie and Shari Kuroyama and Makoto Onizuka",
  title =        "Scaling manifold ranking based image retrieval",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "341--352",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Manifold Ranking is a graph-based ranking algorithm being successfully applied to retrieve images from multimedia databases. Given a query image, Manifold Ranking computes the ranking scores of images in the database by exploiting the relationships among them expressed in the form of a graph. Since Manifold Ranking effectively utilizes the global structure of the graph, it is significantly better at finding intuitive results compared with current approaches. Fundamentally, Manifold Ranking requires an inverse matrix to compute ranking scores and so needs $ O(n^3) $ time, where $n$ is the number of images. Manifold Ranking, unfortunately, does not scale to support databases with large numbers of images. Our solution, Mogul, is based on two ideas: (1) It efficiently computes ranking scores by sparse matrices, and (2) It skips unnecessary score computations by estimating upper bounding scores. These two ideas reduce the time complexity of Mogul to $ O(n)$ from $ O(n^3)$ of the inverse matrix approach. Experiments show that Mogul is much faster and gives significantly better retrieval quality than a state-of-the-art approximation approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Barber:2014:MEH,
  author =       "R. Barber and G. Lohman and I.
Pandis and V. Raman and R. Sidle and G. Attaluri and N. Chainani and S. Lightstone and D. Sharpe",
  title =        "Memory-efficient hash joins",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "353--364",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We present new hash tables for joins, and a hash join based on them, that consumes far less memory and is usually faster than recently published in-memory joins. Our hash join is not restricted to outer tables that fit wholly in memory. Key to this hash join is a new concise hash table (CHT), a linear probing hash table that has 100\% fill factor, and uses a sparse bitmap with embedded population counts to almost entirely avoid collisions. This bitmap also serves as a Bloom filter for use in multi-table joins. We study the random access characteristics of hash joins, and renew the case for non-partitioned hash joins. We introduce a variant of partitioned joins in which only the build is partitioned, but the probe is not, as this is more efficient for large outer tables than traditional partitioned joins. This also avoids partitioning costs during the probe, while at the same time allowing parallel build without latching overheads. Additionally, we present a variant of CHT, called a concise array table (CAT), that can be used when the key domain is moderately dense. CAT is collision-free and avoids storing join keys in the hash table. We perform a detailed comparison of CHT and CAT against leading in-memory hash joins. Our experiments show that we can reduce the memory usage by one to three orders of magnitude, while also being competitive in performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Alexe:2014:PAI,
  author =       "Bogdan Alexe and Mary Roth and Wang-Chiew Tan",
  title =        "Preference-aware integration of temporal data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "365--376",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "A complete description of an entity is rarely contained in a single data source, but rather, it is often distributed across different data sources. Applications based on personal electronic health records, sentiment analysis, and financial records all illustrate that significant value can be derived from integrated, consistent, and queryable profiles of entities from different sources. Even more so, such integrated profiles are considerably enhanced if temporal information from different sources is carefully accounted for. We develop a simple and yet versatile operator, called prawn, that is typically called as a final step of an entity integration workflow. Prawn is capable of consistently integrating and resolving temporal conflicts in data that may contain multiple dimensions of time based on a set of preference rules specified by a user (hence the name prawn for preference-aware union). In the event that not all conflicts can be resolved through preferences, one can enumerate each possible consistent interpretation of the result returned by prawn at a given time point through a polynomial-delay algorithm. In addition to providing algorithms for implementing prawn, we study and establish several desirable properties of prawn. First, prawn produces the same temporally integrated outcome, modulo representation of time, regardless of the order in which data sources are integrated.
Second, prawn can be customized to integrate temporal data for different applications by specifying application-specific preference rules. Third, we show experimentally that our implementation of prawn is feasible on both ``small'' and ``big'' data platforms in that it is efficient in both storage and execution time. Finally, we demonstrate a fundamental advantage of prawn: we illustrate that standard query languages can be immediately used to pose useful temporal queries over the integrated and resolved entity repository.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhou:2014:MSD,
  author =       "Chang Zhou and Jun Gao and Binbin Sun and Jeffrey Xu Yu",
  title =        "{MOCgraph}: scalable distributed graph processing using message online computing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "377--388",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Existing distributed graph processing frameworks, e.g., Pregel, Giraph, GPS and GraphLab, mainly exploit main memory to support flexible graph operations for efficiency. Due to the complexity of graph analytics, huge memory space is required especially for those graph analytics that spawn large intermediate results. Existing frameworks may terminate abnormally or degrade performance seriously when the memory is exhausted or the external storage has to be used. In this paper, we propose MOCgraph, a scalable distributed graph processing framework to reduce the memory footprint and improve the scalability, based on message online computing. MOCgraph consumes incoming messages in a streaming manner, so as to handle larger graphs or more complex analytics with the same memory capacity.
MOCgraph also exploits message online computing with external storage to provide an efficient out-of-core support. We implement MOCgraph on top of Apache Giraph, and test it against several representative graph algorithms on large graph datasets. Experiments illustrate that MOCgraph is efficient and memory-saving, especially for graph analytics with large intermediate results.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Huang:2014:NAL,
  author =       "Jian Huang and Karsten Schwan and Moinuddin K. Qureshi",
  title =        "{NVRAM-aware} logging in transaction systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "389--400",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Emerging byte-addressable, non-volatile memory technologies (NVRAM) like phase-change memory can increase the capacity of future memory systems by orders of magnitude. Compared to systems that rely on disk storage, NVRAM-based systems promise significant improvements in performance for key applications like online transaction processing (OLTP). Unfortunately, NVRAM systems suffer from two drawbacks: their asymmetric read-write performance and the notable higher cost of the new memory technologies compared to disk. This paper investigates the cost-effective use of NVRAM in transaction systems. It shows that using NVRAM only for the logging subsystem (NV-Logging) provides much higher transactions per dollar than simply replacing all disk storage with NVRAM. Specifically, for NV-Logging, we show that the software overheads associated with centralized log buffers cause performance bottlenecks and limit scaling.
The per-transaction logging methods described in the paper help avoid these overheads, enabling concurrent logging for multiple transactions. Experimental results with a faithful emulation of future NVRAM-based servers using the TPCC, TATP, and TPCB benchmarks show that NV-Logging improves throughput by 1.42--2.72x over the costlier option of replacing all disk storage with NVRAM. Results also show that NV-Logging performs 1.21--6.71x better than when logs are placed into the PMFS NVRAM-optimized file system. Compared to state-of-the-art distributed logging, NV-Logging delivers 20.4\% throughput improvements.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chandramouli:2014:THP,
  author =       "Badrish Chandramouli and Jonathan Goldstein and Mike Barnett and Robert DeLine and Danyel Fisher and John C. Platt and James F. Terwilliger and John Wernsing",
  title =        "{Trill}: a high-performance incremental query processor for diverse analytics",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "401--412",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper introduces Trill --- a new query processor for analytics.
Trill fulfills a combination of three requirements for a query processor to serve the diverse big data analytics space: (1) Query Model: Trill is based on a tempo-relational model that enables it to handle streaming and relational queries with early results, across the latency spectrum from real-time to offline; (2) Fabric and Language Integration: Trill is architected as a high-level language library that supports rich data-types and user libraries, and integrates well with existing distribution fabrics and applications; and (3) Performance: Trill's throughput is high across the latency spectrum. For streaming data, Trill's throughput is 2--4 orders of magnitude higher than comparable streaming engines. For offline relational queries, Trill's throughput is comparable to a major modern commercial columnar DBMS. Trill uses a streaming batched-columnar data representation with a new dynamic compilation-based system architecture that addresses all these requirements. In this paper, we describe Trill's new design and architecture, and report experimental results that demonstrate Trill's high performance across diverse analytics scenarios. We also describe how Trill's ability to support diverse analytics has resulted in its adoption across many usage scenarios at Microsoft.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Song:2014:EPM,
  author =       "Chunyao Song and Tingjian Ge and Cindy Chen and Jie Wang",
  title =        "Event pattern matching over graph streams",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "4",
  pages =        "413--424",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Feb 9 18:24:35 MST 2015",
  bibsource =    "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "A graph is a fundamental and general data structure underlying all data applications. Many applications today call for the management and query capabilities directly on graphs. Real time graph streams, as seen in road networks, social and communication networks, and web requests, are such applications. Event pattern matching requires the awareness of graph structures, which is different from traditional complex event processing. It also requires a focus on the dynamicity of the graph, time order constraints in patterns, and online query processing, which deviates significantly from previous work on subgraph matching as well. We study the semantics and efficient online algorithms for this important and intriguing problem, and evaluate our approaches with extensive experiments over real world datasets in four different domains.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2014:CAA, author = "Qi Li and Yaliang Li and Jing Gao and Lu Su and Bo Zhao and Murat Demirbas and Wei Fan and Jiawei Han", title = "A confidence-aware approach for truth discovery on long-tail data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "4", pages = "425--436", month = dec, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many real world applications, the same item may be described by multiple sources. As a consequence, conflicts among these sources are inevitable, which leads to an important task: how to identify which piece of information is trustworthy, i.e., the truth discovery task. Intuitively, if the piece of information is from a reliable source, then it is more trustworthy, and the source that provides trustworthy information is more reliable. Based on this principle, truth discovery approaches have been proposed to infer source reliability degrees and the most trustworthy information (i.e., the truth) simultaneously. However, existing approaches overlook the ubiquitous long-tail phenomenon in the tasks, i.e., most sources only provide a few claims and only a few sources make plenty of claims, which causes the source reliability estimation for small sources to be unreasonable. To tackle this challenge, we propose a confidence-aware truth discovery (CATD) method to automatically detect truths from conflicting data with long-tail phenomenon. The proposed method not only estimates source reliability, but also considers the confidence interval of the estimation, so that it can effectively reflect real source reliability for sources with various levels of participation. 
Experiments on four real world tasks as well as simulated multi-source long-tail datasets demonstrate that the proposed method outperforms existing state-of-the-art truth discovery approaches by successfully discounting the effect of small sources.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shen:2014:FFR, author = "Yanyan Shen and Gang Chen and H. V. Jagadish and Wei Lu and Beng Chin Ooi and Bogdan Marius Tudor", title = "Fast failure recovery in distributed graph processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "4", pages = "437--448", month = dec, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed graph processing systems increasingly require many compute nodes to cope with the requirements imposed by contemporary graph-based Big Data applications. However, increasing the number of compute nodes increases the chance of node failures. Therefore, provisioning an efficient failure recovery strategy is critical for distributed graph processing systems. This paper proposes a novel recovery mechanism for distributed graph processing systems that parallelizes the recovery process. The key idea is to partition the part of the graph that is lost during a failure among a subset of the remaining nodes. To do so, we augment the existing checkpoint-based and log-based recovery schemes with a partitioning mechanism that is sensitive to the total computation and communication cost of the recovery process. Our implementation on top of the widely used Giraph system outperforms checkpoint-based recovery by up to 30x on a cluster of 40 compute nodes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Then:2014:MME, author = "Manuel Then and Moritz Kaufmann and Fernando Chirigati and Tuan-Anh Hoang-Vu and Kien Pham and Alfons Kemper and Thomas Neumann and Huy T. Vo", title = "The more the merrier: efficient multi-source graph traversal", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "4", pages = "449--460", month = dec, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph analytics on social networks, Web data, and communication networks has been widely used in a plethora of applications. Many graph analytics algorithms are based on breadth-first search (BFS) graph traversal, which is not only time-consuming for large datasets but also involves much redundant computation when executed multiple times from different start vertices. In this paper, we propose Multi-Source BFS (MS-BFS), an algorithm that is designed to run multiple concurrent BFSs over the same graph on a single CPU core while scaling up as the number of cores increases. MS-BFS leverages the properties of small-world networks, which apply to many real-world graphs, and enables efficient graph traversal that: (i) shares common computation across concurrent BFSs; (ii) greatly reduces the number of random memory accesses; and (iii) does not incur synchronization costs. We demonstrate how a real graph analytics application---all-vertices closeness centrality---can be efficiently solved with MS-BFS. 
Furthermore, we present an extensive experimental evaluation with both synthetic and real datasets, including Twitter and Wikipedia, showing that MS-BFS provides almost linear scalability with respect to the number of cores and excellent scalability for increasing graph sizes, outperforming state-of-the-art BFS algorithms by more than one order of magnitude when running a large number of BFSs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wandelt:2015:MCS, author = "Sebastian Wandelt and Ulf Leser", title = "{MRCSI}: compressing and searching string collections with multiple references", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "461--472", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficiently storing and searching collections of similar strings, such as large populations of genomes or long change histories of documents from Wikis, is a timely and challenging problem. Several recent proposals could drastically reduce space requirements by exploiting the similarity between strings in so-called reference-based compression. However, these indexes are usually not searchable any more, i.e., in these methods search efficiency is sacrificed for storage efficiency. We propose Multi-Reference Compressed Search Indexes (MRCSI) as a framework for efficiently compressing dissimilar string collections. 
In contrast to previous works which can use only a single reference for compression, MRCSI (a) uses multiple references for achieving increased compression rates, where the reference set need not be specified by the user but is determined automatically, and (b) supports efficient approximate string searching with edit distance constraints. We prove that finding the smallest MRCSI is NP-hard. We then propose three heuristics for computing MRCSIs achieving increasing compression ratios. Compared to state-of-the-art competitors, our methods target an interesting and novel sweet-spot between high compression ratio versus search efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2015:YFC, author = "Rui Ding and Qiang Wang and Yingnong Dang and Qiang Fu and Haidong Zhang and Dongmei Zhang", title = "{YADING}: fast clustering of large-scale time series data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "473--484", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fast and scalable analysis techniques are becoming increasingly important in the era of big data, because they are the enabling techniques to create real-time and interactive experiences in data analysis. Time series are widely available in diverse application areas. Due to the large number of time series instances (e.g., millions) and the high dimensionality of each time series instance (e.g., thousands), it is challenging to conduct clustering on large-scale time series, and it is even more challenging to do so in real-time to support interactive exploration. 
In this paper, we propose a novel end-to-end time series clustering algorithm, YADING, which automatically clusters large-scale time series with fast performance and quality results. Specifically, YADING consists of three steps: sampling the input dataset, conducting clustering on the sampled dataset, and assigning the rest of the input data to the clusters generated on the sampled dataset. In particular, we provide theoretical proof on the lower and upper bounds of the sample size, which not only guarantees YADING's high performance, but also ensures the distribution consistency between the input dataset and the sampled dataset. We also select $ L_1 $ norm as similarity measure and the multi-density approach as the clustering method. With theoretical bound, this selection ensures YADING's robustness to time series variations due to phase perturbation and random noise. Evaluation results have demonstrated that on typical-scale (100,000 time series each with 1,000 dimensions) datasets, YADING is about 40 times faster than the state-of-the-art, sampling-based clustering algorithm DENCLUE 2.0, and about 1,000 times faster than DBSCAN and CLARANS. YADING has also been used by product teams at Microsoft to analyze service performance. Two of such use cases are shared in this paper.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2015:HWS, author = "Ting Wu and Lei Chen and Pan Hui and Chen Jason Zhang and Weikai Li", title = "Hear the whole story: towards the diversity of opinion in crowdsourcing markets", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "485--496", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recent surge in popularity of crowdsourcing has brought with it a new opportunity for engaging human intelligence in the process of data analysis. Crowdsourcing provides a fundamental mechanism for enabling online workers to participate in tasks that are either too difficult to be solved solely by a computer or too expensive to employ experts to perform. In the field of social science, four elements are required to form a wise crowd --- Diversity of Opinion, Independence, Decentralization and Aggregation. However, while the other three elements are already studied and implemented in current crowdsourcing platforms, the 'Diversity of Opinion' has not been functionally enabled. In this paper, we address the algorithmic optimizations towards the diversity of opinion of crowdsourcing marketplaces. From a computational perspective, in order to build a wise crowd, we need to quantitatively model the diversity, and take it into consideration for constructing the crowd. In a crowdsourcing marketplace, we usually encounter two basic paradigms for worker selection: building a crowd to wait for tasks to come and selecting workers for a given task. Therefore, we propose our Similarity-driven Model (S-Model) and Task-driven Model (T-Model) for both of the paradigms. 
Under both of the models, we propose efficient and effective algorithms to enlist a budgeted number of workers, which have the optimal diversity. We have verified our solutions with extensive experiments on both synthetic datasets and real data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chatzistergiou:2015:RUR, author = "Andreas Chatzistergiou and Marcelo Cintra and Stratis D. Viglas", title = "{REWIND}: recovery write-ahead system for in-memory non-volatile data-structures", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "497--508", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent non-volatile memory (NVM) technologies, such as PCM, STT-MRAM and ReRAM, can act as both main memory and storage. This has led to research into NVM programming models, where persistent data structures remain in memory and are accessed directly through CPU loads and stores. Existing mechanisms for transactional updates are not appropriate in such a setting as they are optimized for block-based storage. We present REWIND, a user-mode library approach to managing transactional updates directly from user code written in an imperative general-purpose language. REWIND relies on a custom persistent in-memory data structure for the log that supports recoverable operations on itself. The scheme also employs a combination of non-temporal updates, persistent memory fences, and lightweight logging. Experimental results on synthetic transactional workloads and TPC-C show the overhead of REWIND compared to its non-recoverable equivalent to be within a factor of only 1.5 and 1.39 respectively. 
Moreover, REWIND outperforms state-of-the-art approaches for data structure recoverability as well as general purpose and NVM-aware DBMS-based recovery schemes by up to two orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:ICS, author = "Rong-Hua Li and Lu Qin and Jeffrey Xu Yu and Rui Mao", title = "Influential community search in large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "509--520", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Community search is a problem of finding densely connected subgraphs that satisfy the query conditions in a network, which has attracted much attention in recent years. However, all the previous studies on community search do not consider the influence of a community. In this paper, we introduce a novel community model called $k$-influential community based on the concept of $k$-core, which can capture the influence of a community. Based on the new community model, we propose a linear-time online search algorithm to find the top-$r$ $k$-influential communities in a network. To further speed up the influential community search algorithm, we devise a linear-space index structure which supports efficient search of the top-$r$ $k$-influential communities in optimal time. We also propose an efficient algorithm to maintain the index when the network is frequently updated. We conduct extensive experiments on 7 real-world large networks, and the results demonstrate the efficiency and effectiveness of the proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kim:2015:RSV, author = "Albert Kim and Eric Blais and Aditya Parameswaran and Piotr Indyk and Sam Madden and Ronitt Rubinfeld", title = "Rapid sampling for visualizations with ordering guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "521--532", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visualizations are frequently used as a means to understand trends and gather insights from datasets, but often take a long time to generate. In this paper, we focus on the problem of rapidly generating approximate visualizations while preserving crucial visual properties of interest to analysts. Our primary focus will be on sampling algorithms that preserve the visual property of ordering; our techniques will also apply to some other visual properties. For instance, our algorithms can be used to generate an approximate visualization of a bar chart very rapidly, where the comparisons between any two bars are correct. We formally show that our sampling algorithms are generally applicable and provably optimal in theory, in that they do not take more samples than necessary to generate the visualizations with ordering guarantees. They also work well in practice, correctly ordering output groups while taking orders of magnitude fewer samples and much less time than conventional sampling schemes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chang:2015:OEE, author = "Lijun Chang and Xuemin Lin and Wenjie Zhang and Jeffrey Xu Yu and Ying Zhang and Lu Qin", title = "Optimal enumeration: efficient top-$k$ tree matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "533--544", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Driven by many real applications, graph pattern matching has attracted a great deal of attention recently. Consider that a twig-pattern matching may result in an extremely large number of matches in a graph; this may not only confuse users by providing too many results but also lead to high computational costs. In this paper, we study the problem of top-$k$ tree pattern matching; that is, given a rooted tree $T$, compute its top-$k$ matches in a directed graph $G$ based on the twig-pattern matching semantics. We firstly present a novel and optimal enumeration paradigm based on the principle of Lawler's procedure. We show that our enumeration algorithm runs in $ O(n_T + \log k)$ time in each round where $ n_T$ is the number of nodes in $T$. Considering that the time complexity to output a match of $T$ is $ O(n_T)$ and $ n_T \geq \log k$ in practice, our enumeration technique is optimal. Moreover, the cost of generating top-$1$ match of $T$ in our algorithm is $ O(m_R)$ where $ m_R$ is the number of edges in the transitive closure of a data graph $G$ involving all relevant nodes to $T$. $ O(m_R)$ is also optimal in the worst case without pre-knowledge of $G$. 
Consequently, our algorithm is optimal with the running time $ O(m_R + k(n_T + \log k))$ in contrast to the time complexity $ O(m_R \log k + k n_T (\log k + d_T))$ of the existing technique where $ d_T$ is the maximal node degree in $T$. Secondly, a novel priority based access technique is proposed, which greatly reduces the number of edges accessed and results in a significant performance improvement. Finally, we apply our techniques to the general form of top-$k$ graph pattern matching problem (i.e., query is a graph) to improve the existing techniques. Comprehensive empirical studies demonstrate that our techniques may improve the existing techniques by orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lazerson:2015:MDS, author = "Arnon Lazerson and Izchak Sharfman and Daniel Keren and Assaf Schuster and Minos Garofalakis and Vasilis Samoladas", title = "Monitoring distributed streams using convex decompositions", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "545--556", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Emerging large-scale monitoring applications rely on continuous tracking of complex data-analysis queries over collections of massive, physically-distributed data streams. Thus, in addition to the space- and time-efficiency requirements of conventional stream processing (at each remote monitor site), effective solutions also need to guarantee communication efficiency (over the underlying communication network). 
The complexity of the monitored query adds to the difficulty of the problem --- this is especially true for non-linear queries (e.g., joins), where no obvious solutions exist for distributing the monitored condition across sites. The recently proposed geometric method, based on the notion of covering spheres, offers a generic methodology for splitting an arbitrary (non-linear) global condition into a collection of local site constraints, and has been applied to massive distributed stream-monitoring tasks, achieving state-of-the-art performance. In this paper, we present a far more general geometric approach, based on the convex decomposition of an appropriate subset of the domain of the monitoring query, and formally prove that it is always guaranteed to perform at least as good as the covering spheres method. We analyze our approach and demonstrate its effectiveness for the important case of sketch-based approximate tracking for norm, range-aggregate, and join-aggregate queries, which have numerous applications in streaming data analysis. Experimental results on real-life data streams verify the superiority of our approach in practical settings, showing that it substantially outperforms the covering spheres method.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:UGD, author = "Kun Li and Daisy Zhe Wang and Alin Dobra and Christopher Dudley", title = "{UDA}-{GIST}: an in-database framework to unify data-parallel and state-parallel analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "557--568", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Enterprise applications need sophisticated in-database analytics in addition to traditional online analytical processing from a database. To meet customers' pressing demands, database vendors have been pushing advanced analytical techniques into databases. Most major DBMSes offer User-Defined Aggregate (UDA), a data-driven operator, to implement many of the analytical techniques in parallel. However, UDAs can not be used to implement statistical algorithms such as Markov chain Monte Carlo (MCMC), where most of the work is performed by iterative transitions over a large state that can not be naively partitioned due to data dependency. Typically, this type of statistical algorithm requires pre-processing to setup the large state in the first place and demands post-processing after the statistical inference. This paper presents General Iterative State Transition (GIST), a new database operator for parallel iterative state transitions over large states. GIST receives a state constructed by a UDA, and then performs rounds of transitions on the state until it converges. A final UDA performs post-processing and result extraction. We argue that the combination of UDA and GIST (UDA-GIST) unifies data-parallel and state-parallel processing in a single system, thus significantly extending the analytical capabilities of DBMSes. 
We exemplify the framework through two high-profile applications: cross-document coreference and image denoising. We show that the in-database framework allows us to tackle a 27 times larger problem than solved by the state-of-the-art for the first application and achieves 43 times speedup over the state-of-the-art for the second application.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2015:EPP, author = "Weiren Yu and Julie A. McCann", title = "Efficient partial-pairs {SimRank} search on large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "569--580", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The assessment of node-to-node similarities based on graph topology arises in a myriad of applications, e.g., web search. SimRank is a notable measure of this type, with the intuition that ``two nodes are similar if their in-neighbors are similar''. While most existing work retrieving SimRank only considers all-pairs SimRank $ s(*, *) $ and single-source SimRank $ s(*, j) $ (scores between every node and query $j$), there are appealing applications for partial-pairs SimRank, e.g., similarity join. Given two node subsets $A$ and $B$ in a graph, partial-pairs SimRank assessment aims to retrieve only $ \{ s(a, b) \}_{ \forall a \in A, \forall b \in B}$. However, the best-known solution appears not self-contained since it hinges on the premise that the SimRank scores with node-pairs in an $h$-go cover set must be given beforehand. This paper focuses on efficient assessment of partial-pairs SimRank in a self-contained manner. 
(1) We devise a novel ``seed germination'' model that computes partial-pairs SimRank in $ O(k | E | \min \{ | A |, | B | \})$ time and $ O(| E | + k | V |)$ memory for $k$ iterations on a graph of $ | V |$ nodes and $ | E |$ edges. (2) We further eliminate unnecessary edge access to improve the time of partial-pairs SimRank to $ O(m \min \{ | A |, | B | \})$, where $ m \leq \min \{ k | E |, \Delta^{2 k} \} $, and $ \Delta $ is the maximum degree. (3) We show that our partial-pairs SimRank model also can handle the computations of all-pairs and single-source SimRanks. (4) We empirically verify that our algorithms are (a) $ 38 \times $ faster than the best-known competitors, and (b) memory-efficient, allowing scores to be assessed accurately on graphs with tens of millions of links.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gatterbauer:2015:LSP, author = "Wolfgang Gatterbauer and Stephan G{\"u}nnemann and Danai Koutra and Christos Faloutsos", title = "Linearized and single-pass belief propagation", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "581--592", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "How can we tell when accounts are fake or real in a social network? And how can we tell which accounts belong to liberal, conservative or centrist users? Often, we can answer such questions and label nodes in a network based on the labels of their neighbors and appropriate assumptions of homophily (``birds of a feather flock together'') or heterophily (``opposites attract''). 
One of the most widely used methods for this kind of inference is Belief Propagation (BP) which iteratively propagates the information from a few nodes with explicit labels throughout a network until convergence. A well-known problem with BP, however, is that there are no known exact guarantees of convergence in graphs with loops. This paper introduces Linearized Belief Propagation (LinBP), a linearization of BP that allows a closed-form solution via intuitive matrix equations and, thus, comes with exact convergence guarantees. It handles homophily, heterophily, and more general cases that arise in multi-class settings. Plus, it allows a compact implementation in SQL. The paper also introduces Single-pass Belief Propagation (SBP), a localized (or ``myopic'') version of LinBP that propagates information across every edge at most once and for which the final class assignments depend only on the nearest labeled neighbors. In addition, SBP allows fast incremental updates in dynamic networks. Our runtime experiments show that LinBP and SBP are orders of magnitude faster than standard BP, while leading to almost identical node labels.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Do:2015:MRM, author = "Loc Do and Hady W. Lauw and Ke Wang", title = "Mining revenue-maximizing bundling configuration", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "593--604", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With greater prevalence of social media, there is an increasing amount of user-generated data revealing consumer preferences for various products and services. 
Businesses seek to harness this wealth of data to improve their marketing strategies. Bundling, or selling two or more items for one price, is a highly-practiced marketing strategy. In this paper, we address the bundle configuration problem from the data-driven perspective. Given a set of items in a seller's inventory, we seek to determine which items should belong to which bundle so as to maximize the total revenue, by mining consumer preferences data. We show that this problem is NP-hard when bundles are allowed to contain more than two items. Therefore, we describe an optimal solution for bundle sizes up to two items, and propose two heuristic solutions for bundles of any larger size. We investigate the effectiveness and the efficiency of the proposed algorithms through experimentations on real-life rating-based preferences data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2015:RKN, author = "Shiyu Yang and Muhammad Aamir Cheema and Xuemin Lin and Wei Wang", title = "Reverse $k$ nearest neighbors query processing: experiments and analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "605--616", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a set of users, a set of facilities and a query facility $q$, a reverse $k$ nearest neighbors (R$k$NN) query returns every user $u$ for which the query is one of its $k$ closest facilities. R$k$NN queries have been extensively studied under a variety of settings and many sophisticated algorithms have been proposed to answer these queries. However, the existing experimental studies suffer from a few limitations. 
For example, some studies estimate the I/O cost by charging a fixed penalty per I/O and we show that this may be misleading. Also, the existing studies either use an extremely small buffer or no buffer at all which puts some algorithms at serious disadvantage. We show that the performance of these algorithms is significantly improved even when a small buffer (containing 100 pages) is used. Finally, in each of the existing studies, the proposed algorithm is mainly compared only with its predecessor assuming that it was the best algorithm at the time which is not necessarily true as shown in our experimental study. Motivated by these limitations, we present a comprehensive experimental study that addresses these limitations and compares some of the most notable algorithms under a wide variety of settings. Furthermore, we also present a carefully developed filtering strategy that significantly improves TPL which is one of the most popular R $k$ NN algorithms. Specifically, the optimized version is up to 20 times faster than the original version and reduces its I/O cost up to two times.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2015:EVR, author = "Xuguang Ren and Junhu Wang", title = "Exploiting vertex relationships in speeding up subgraph isomorphism over large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "617--628", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph Isomorphism is a fundamental problem in graph data processing. Most existing subgraph isomorphism algorithms are based on a backtracking framework which computes the solutions by incrementally matching all query vertices to candidate data vertices. 
However, we observe that extensive duplicate computation exists in these algorithms, and such duplicate computation can be avoided by exploiting relationships between data vertices. Motivated by this, we propose a novel approach, BoostIso, to reduce duplicate computation. Our extensive experiments with real datasets show that, after integrating our approach, most existing subgraph isomorphism algorithms can be speeded up significantly, especially for some graphs with intensive vertex relationships, where the improvement can be up to several orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gatterbauer:2015:ALI, author = "Wolfgang Gatterbauer and Dan Suciu", title = "Approximate lifted inference with probabilistic databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "629--640", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes a new approach for approximate evaluation of \#P-hard queries with probabilistic databases. In our approach, every query is evaluated entirely in the database engine by evaluating a fixed number of query plans, each providing an upper bound on the true probability, then taking their minimum. We provide an algorithm that takes into account important schema information to enumerate only the minimal necessary plans among all possible plans. Importantly, this algorithm is a strict generalization of all known results of PTIME self-join-free conjunctive queries: A query is safe if and only if our algorithm returns one single plan. We also apply three relational query optimization techniques to evaluate all minimal safe plans very fast. 
We give a detailed experimental evaluation of our approach and, in the process, provide a new way of thinking about the value of probabilistic methods over non-probabilistic methods for ranking query answers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vesdapunt:2015:ECA, author = "Norases Vesdapunt and Kedar Bellare and Nilesh Dalvi", title = "Errata for {``Crowdsourcing algorithms for entity resolution''}: {(PVLDB {\bf 7}(12): 1071--1082)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "5", pages = "641--641", month = jan, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:35 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We discovered that there was a duplicate figure in our paper. We accidentally put Figure 13(b) for Figure 12(b). We have provided the correct Figure 12(b) above (See Figure 1). Figure 1 plots the recall of various strategies as a function of the number of questions asked for Places dataset. There was no error in the discussion in our paper (See Section 6.2.1 in our paper for more details).", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jha:2015:IMM, author = "Saurabh Jha and Bingsheng He and Mian Lu and Xuntao Cheng and Huynh Phung Huynh", title = "Improving main memory hash joins on {Intel Xeon Phi} processors: an experimental approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "6", pages = "642--653", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 10 17:42:37 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern processor technologies have driven new designs and implementations in main-memory hash joins. Recently, Intel Many Integrated Core (MIC) co-processors (commonly known as Xeon Phi) embrace emerging x86 single-chip many-core techniques. Compared with contemporary multi-core CPUs, Xeon Phi has quite different architectural features: wider SIMD instructions, many cores and hardware contexts, as well as lower-frequency in-order cores. In this paper, we experimentally revisit the state-of-the-art hash join algorithms on Xeon Phi co-processors. In particular, we study two camps of hash join algorithms: hardware-conscious ones that advocate careful tailoring of the join algorithms to underlying hardware architectures and hardware-oblivious ones that omit such careful tailoring. For each camp, we study the impact of architectural features and software optimizations on Xeon Phi in comparison with results on multi-core CPUs. Our experiments show two major findings on Xeon Phi, which are quantitatively different from those on multi-core CPUs. First, the impact of architectural features and software optimizations has quite different behavior on Xeon Phi in comparison with those on the CPU, which calls for new optimization and tuning on Xeon Phi. 
Second, hardware oblivious algorithms can outperform hardware conscious algorithms on a wide parameter window. These two findings further shed light on the design and implementation of query processing on new-generation single-chip many-core technologies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hammoud:2015:DDR, author = "Mohammad Hammoud and Dania Abed Rabbou and Reza Nouri and Seyed-Mehdi-Reza Beheshti and Sherif Sakr", title = "{DREAM}: distributed {RDF} engine with adaptive query planner and minimal communication", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "6", pages = "654--665", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 10 17:42:37 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Resource Description Framework (RDF) and SPARQL query language are gaining wide popularity and acceptance. In this paper, we present DREAM, a distributed and adaptive RDF system. As opposed to existing RDF systems, DREAM avoids partitioning RDF datasets and partitions only SPARQL queries. By not partitioning datasets, DREAM offers a general paradigm for different types of pattern matching queries, and entirely averts intermediate data shuffling (only auxiliary data are shuffled). Besides, by partitioning queries, DREAM presents an adaptive scheme, which automatically runs queries on various numbers of machines depending on their complexities. Hence, in essence DREAM combines the advantages of the state-of-the-art centralized and distributed RDF systems, whereby data communication is avoided and cluster resources are aggregated. Likewise, it precludes their disadvantages, wherein system resources are limited and communication overhead is typically hindering. 
DREAM achieves all its goals via employing a novel graph-based, rule-oriented query planner and a new cost model. We implemented DREAM and conducted comprehensive experiments on a private cluster and on the Amazon EC2 platform. Results show that DREAM can significantly outperform three related popular RDF systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2015:OTA, author = "Shuo Chen and Ju Fan and Guoliang Li and Jianhua Feng and Kian-lee Tan and Jinhui Tang", title = "Online topic-aware influence maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "6", pages = "666--677", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 10 17:42:37 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Influence maximization, whose objective is to select $k$ users (called seeds) from a social network such that the number of users influenced by the seeds (called influence spread) is maximized, has attracted significant attention due to its widespread applications, such as viral marketing and rumor control. However, in real-world social networks, users have their own interests (which can be represented as topics) and are more likely to be influenced by their friends (or friends' friends) with similar topics. We can increase the influence spread by taking into consideration topics. To address this problem, we study topic-aware influence maximization, which, given a topic-aware influence maximization (TIM) query, finds $k$ seeds from a social network such that the topic-aware influence spread of the $k$ seeds is maximized. Our goal is to enable online TIM queries. 
Since the topic-aware influence maximization problem is NP-hard, we focus on devising efficient algorithms to achieve instant performance while keeping a high influence spread. We utilize a maximum influence arborescence (MIA) model to approximate the computation of influence spread. To efficiently find $k$ seeds under the MIA model, we first propose a best-effort algorithm with $ 1 - 1 / e$ approximation ratio, which estimates an upper bound of the topic-aware influence of each user and utilizes the bound to prune large numbers of users with small influence. We devise effective techniques to estimate tighter upper bounds. We then propose a faster topic-sample-based algorithm with $ \epsilon \cdot (1 - 1 / e)$ approximation ratio for any $ \epsilon \in (0, 1]$, which materializes the influence spread of some topic-distribution samples and utilizes the materialized information to avoid computing the actual influence of users with small influences. Experimental results show that our methods significantly outperform baseline approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nazi:2015:WWF, author = "Azade Nazi and Zhuojie Zhou and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Walk, not wait: faster sampling over online social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "6", pages = "678--689", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 10 17:42:37 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we introduce a novel, general purpose, technique for faster sampling of nodes over an online social network. 
Specifically, unlike traditional random walks which wait for the convergence of sampling distribution to a predetermined target distribution --- a waiting process that incurs a high query cost --- we develop WALK-ESTIMATE, which starts with a much shorter random walk, and then proactively estimate the sampling probability for the node taken before using acceptance--rejection sampling to adjust the sampling probability to the predetermined target distribution. We present a novel backward random walk technique which provides provably unbiased estimations for the sampling probability, and demonstrate the superiority of WALK-ESTIMATE over traditional random walks through theoretical analysis and extensive experiments over real world online social networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Benedikt:2015:QAP, author = "Michael Benedikt and Julien Leblay and Efthymia Tsamoura", title = "Querying with access patterns and integrity constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "6", pages = "690--701", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 10 17:42:37 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional query processing involves a search for plans formed by applying algebraic operators on top of primitives representing access to relations in the input query. But many querying scenarios involve two interacting issues that complicate the search. On the one hand, the search space may be limited by access restrictions associated with the interfaces to datasources, which require certain parameters to be given as inputs. 
On the other hand, the search space may be extended through the presence of integrity constraints that relate sources to each other, allowing for plans that do not match the structure of the user query. In this paper we present the first optimization approach that attacks both these difficulties within a single framework, presenting a system in which classical cost-based join optimization is extended to support both access-restrictions and constraints. Instead of iteratively exploring subqueries of the input query, our optimizer explores a space of proofs that witness the answering of the query, where each proof has a direct correspondence with a query plan.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tangwongsan:2015:GIS, author = "Kanat Tangwongsan and Martin Hirzel and Scott Schneider and Kun-Lung Wu", title = "General incremental sliding-window aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "702--713", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processing is gaining importance as more data becomes available in the form of continuous streams and companies compete to promptly extract insights from them. In such applications, sliding-window aggregation is a central operator, and incremental aggregation helps avoid the performance penalty of re-aggregating from scratch for each window change. This paper presents Reactive Aggregator (RA), a new framework for incremental sliding-window aggregation. RA is general in that it does not require aggregation functions to be invertible or commutative, and it does not require windows to be FIFO. 
We implemented RA as a drop-in replacement for the Aggregate operator of a commercial streaming engine. Given $m$ updates on a window of size $n$, RA has an algorithmic complexity of $ O(m + m \log (n / m))$, rivaling the best prior algorithms for any $m$. Furthermore, RA's implementation minimizes overheads from allocation and pointer traversals by using a single flat array.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lei:2015:SER, author = "Chuan Lei and Zhongfang Zhuang and Elke A. Rundensteiner and Mohamed Eltabakh", title = "Shared execution of recurring workloads in {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "714--725", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increasing complexity of data-intensive MapReduce workloads, Hadoop must often accommodate hundreds or even thousands of recurring analytics queries that periodically execute over frequently updated datasets, e.g., latest stock transactions, new log files, or recent news feeds. For many applications, such recurring queries come with user-specified service-level agreements (SLAs), commonly expressed as the maximum allowed latency for producing results before their merits decay. The recurring nature of these emerging workloads combined with their SLA constraints make it challenging to share and optimize their execution. While some recent efforts on multi-job optimization in MapReduce have emerged, they focus on only sharing work among ad-hoc jobs on static datasets. Unfortunately, these sharing techniques neither take the recurring nature of the queries into account nor guarantee the satisfaction of the SLA requirements. 
In this work, we propose the first scalable multi-query sharing engine tailored for recurring workloads in the MapReduce infrastructure, called ``Helix''. Helix deploys new sliced window-alignment techniques to create sharing opportunities among recurring queries without introducing additional I/O overheads or unnecessary data scans. And then, Helix introduces a cost/benefit model for creating a sharing plan among the recurring queries, and a scheduling strategy for executing them to maximize the SLA satisfaction. Our experimental results over real-world datasets confirm that Helix significantly outperforms the state-of-art techniques by an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Narasayya:2015:SBP, author = "Vivek Narasayya and Ishai Menache and Mohit Singh and Feng Li and Manoj Syamala and Surajit Chaudhuri", title = "Sharing buffer pool memory in multi-tenant relational database-as-a-service", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "726--737", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational database-as-a-service (DaaS) providers need to rely on multi-tenancy and resource sharing among tenants, since statically reserving resources for a tenant is not cost effective. A major consequence of resource sharing is that the performance of one tenant can be adversely affected by resource demands of other co-located tenants. One such resource that is essential for good performance of a tenant's workload is buffer pool memory. In this paper, we study the problem of how to effectively share buffer pool memory in multi-tenant relational DaaS. 
We first develop an SLA framework that defines and enforces accountability of the service provider to the tenant even when buffer pool memory is not statically reserved on behalf of the tenant. Next, we present a novel buffer pool page replacement algorithm (MT-LRU) that builds upon theoretical concepts from weighted online caching, and is designed for multi-tenant scenarios involving SLAs and overbooking. MT-LRU generalizes the LRU-K algorithm which is commonly used in relational database systems. We have prototyped our techniques inside a commercial DaaS engine and extensive experiments demonstrate the effectiveness of our solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2015:AWQ, author = "Yunjun Gao and Qing Liu and Gang Chen and Baihua Zheng and Linlin Zhou", title = "Answering why-not questions on reverse top-$k$ queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "738--749", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Why-not questions, which aim to seek clarifications on the missing tuples for query results, have recently received considerable attention from the database community. In this paper, we systematically explore why-not questions on reverse top-$k$ queries, owing to its importance in multi-criteria decision making. Given an initial reverse top-$k$ query and a missing/why-not weighting vector set W$_m$ that is absent from the query result, why-not questions on reverse top-$k$ queries explain why W$_m$ does not appear in the query result and provide suggestions on how to refine the initial query with minimum penalty to include W$_m$ in the refined query result. 
We first formalize why-not questions on reverse top-$k$ queries and reveal their semantics, and then propose a unified framework called WQRTQ to answer why-not questions on both monochromatic and bichromatic reverse top-$k$ queries. Our framework offers three solutions, namely, (i) modifying a query point $q$, (ii) modifying a why-not weighting vector set W$_m$ and a parameter $k$, and (iii) modifying $q$, W$_m$, and $k$ simultaneously, to cater for different application scenarios. Extensive experimental evaluation using both real and synthetic data sets verifies the effectiveness and efficiency of the presented algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papadopoulos:2015:PAP, author = "Dimitrios Papadopoulos and Charalampos Papamanthou and Roberto Tamassia and Nikos Triandopoulos", title = "Practical authenticated pattern matching with optimal proof size", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "750--761", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We address the problem of authenticating pattern matching queries over textual data that is outsourced to an untrusted cloud server. By employing cryptographic accumulators in a novel optimal integrity-checking tool built directly over a suffix tree, we design the first authenticated data structure for verifiable answers to pattern matching queries featuring fast generation of constant-size proofs. 
We present two main applications of our new construction to authenticate: (i) pattern matching queries over text documents, and (ii) exact path queries over XML documents. Answers to queries are verified by proofs of size at most 500 bytes for text pattern matching, and at most 243 bytes for exact path XML search, independently of the document or answer size. By design, our authentication schemes can also be parallelized to offer extra efficiency during data outsourcing. We provide a detailed experimental evaluation of our schemes showing that for both applications the times required to compute and verify a proof are very small --- e.g., it takes less than $ 10 \mu $ s to generate a proof for a pattern (mis)match of $ 10^2 $ characters in a text of $ 10^6 $ characters, once the query has been evaluated.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Loghin:2015:PSB, author = "Dumitrel Loghin and Bogdan Marius Tudor and Hao Zhang and Beng Chin Ooi and Yong Meng Teo", title = "A performance study of big data on small nodes", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "762--773", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The continuous increase in volume, variety and velocity of Big Data exposes datacenter resource scaling to an energy utilization problem. Traditionally, datacenters employ x86-64 (big) server nodes with power usage of tens to hundreds of Watts. But lately, low-power (small) systems originally developed for mobile devices have seen significant improvements in performance. These improvements could lead to the adoption of such small systems in servers, as announced by major industry players. 
In this context, we systematically conduct a performance study of Big Data execution on small nodes in comparison with traditional big nodes, and present insights that would be useful for future development. We run Hadoop MapReduce, MySQL and in-memory Shark workloads on clusters of ARM big.LITTLE boards and Intel Xeon server systems. We evaluate execution time, energy usage and total cost of running the workloads on self-hosted ARM and Xeon nodes. Our study shows that there is no one size fits all rule for judging the efficiency of executing Big Data workloads on small and big nodes. But small memory size, low memory and I/O bandwidths, and software immaturity concur in canceling the lower-power advantage of ARM servers. We show that I/O-intensive MapReduce workloads are more energy-efficient to run on Xeon nodes. In contrast, database query processing is always more energy-efficient on ARM servers, at the cost of slightly lower throughput. With minor software modifications, CPU-intensive MapReduce workloads are almost four times cheaper to execute on ARM servers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papenbrock:2015:DCB, author = "Thorsten Papenbrock and Sebastian Kruse and Jorge-Arnulfo Quian{\'e}-Ruiz and Felix Naumann", title = "Divide \& conquer-based inclusion dependency discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "774--785", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The discovery of all inclusion dependencies (INDs) in a dataset is an important part of any data profiling effort. 
Apart from the detection of foreign key relationships, INDs can help to perform data integration, query optimization, integrity checking, or schema (re-)design. However, the detection of INDs gets harder as datasets become larger in terms of number of tuples as well as attributes. To this end, we propose Binder, an IND detection system that is capable of detecting both unary and $n$-ary INDs. It is based on a divide \& conquer approach, which allows to handle very large datasets --- an important property on the face of the ever increasing size of today's data. In contrast to most related works, we do not rely on existing database functionality nor assume that inspected datasets fit into main memory. This renders Binder an efficient and scalable competitor. Our exhaustive experimental evaluation shows the high superiority of Binder over the state-of-the-art in both unary (Spider) and $n$-ary (Mind) IND discovery. Binder is up to $ 26 \times $ faster than Spider and more than $ 2500 \times $ faster than Mind.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2015:PBT, author = "Shimin Chen and Qin Jin", title = "Persistent {B+}-trees in non-volatile main memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "786--797", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computer systems in the near future are expected to have Non-Volatile Main Memory (NVMM), enabled by a new generation of Non-Volatile Memory (NVM) technologies, such as Phase Change Memory (PCM), STT-MRAM, and Memristor. The non-volatility property has the promise to persist in-memory data structures for instantaneous failure recovery. 
However, realizing such promise requires a careful design to ensure that in-memory data structures are in known consistent states after failures. This paper studies persistent in-memory $ B^+$-Trees as $ B^+$-Trees are widely used in database and data-intensive systems. While traditional techniques, such as undo-redo logging and shadowing, support persistent $ B^+$-Trees, we find that they incur drastic performance overhead because of extensive NVM writes and CPU cache flush operations. PCM-friendly $ B^+$-Trees with unsorted leaf nodes help mediate this issue, but the remaining overhead is still large. In this paper, we propose write atomic $ B^+$-Trees (w$ B^+$-Trees), a new type of main-memory $ B^+$-Trees, that aim to reduce such overhead as much as possible. $ w B^+$-Tree nodes employ a small indirect slot array and/or a bitmap so that most insertions and deletions do not require the movement of index entries. In this way, $ w B^+$-Trees can achieve node consistency either through atomic writes in the nodes or by redo-only logging. We model fast NVM using DRAM on a real machine and model PCM using a cycle-accurate simulator. Experimental results show that compared with previous persistent $ B^+$-Tree solutions, $ w B^+$-Trees achieve up to $ 8.8 \times $ speedups on DRAM-like fast NVM and up to $ 27.1 \times $ speedups on PCM for insertions and deletions while maintaining good search performance. Moreover, we replaced Memcached's internal hash index with tree indices. Our real machine Memcached experiments show that $ w B^+$-Trees achieve up to 3.8X improvements over previous persistent tree structures with undo-redo logging or shadowing.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2015:RLC, author = "Yubao Wu and Ruoming Jin and Jing Li and Xiang Zhang", title = "Robust local community detection: on free rider effect and its elimination", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "798--809", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a large network, local community detection aims at finding the community that contains a set of query nodes and also maximizes (minimizes) a goodness metric. This problem has recently drawn intense research interest. Various goodness metrics have been proposed. However, most existing metrics tend to include irrelevant subgraphs in the detected local community. We refer to such irrelevant subgraphs as free riders. We systematically study the existing goodness metrics and provide theoretical explanations on why they may cause the free rider effect. We further develop a query biased node weighting scheme to reduce the free rider effect. In particular, each node is weighted by its proximity to the query node. We define a query biased density metric to integrate the edge and node weights. The query biased densest subgraph, which has the largest query biased density, will shift to the neighborhood of the query nodes after node weighting. We then formulate the query biased densest connected subgraph (QDC) problem, study its complexity, and provide efficient algorithms to solve it. We perform extensive experiments on a variety of real and synthetic networks to evaluate the effectiveness and efficiency of the proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2015:UCC, author = "Hua Fan and Aditya Ramaraju and Marlon McKenzie and Wojciech Golab and Bernard Wong", title = "Understanding the causes of consistency anomalies in {Apache Cassandra}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "810--813", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A recent paper on benchmarking eventual consistency showed that when a constant workload is applied against Cassandra, the staleness of values returned by read operations exhibits interesting but unexplained variations when plotted against time. In this paper we reproduce this phenomenon and investigate in greater depth the low-level mechanisms that give rise to stale reads. We show that the staleness spikes exhibited by Cassandra are strongly correlated with garbage collection, particularly the ``stop-the-world'' phase which pauses all application threads in a Java virtual machine. We show experimentally that the staleness spikes can be virtually eliminated by delaying read operations artificially at servers immediately after a garbage collection pause. In our experiments this yields more than a 98\% reduction in the number of consistency anomalies that exceed 5ms, and has negligible impact on throughput and latency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aslay:2015:VMM, author = "Cigdem Aslay and Wei Lu and Francesco Bonchi and Amit Goyal and Laks V. S. 
Lakshmanan", title = "Viral marketing meets social advertising: ad allocation with minimum regret", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "7", pages = "814--825", month = feb, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:04:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Social advertisement is one of the fastest growing sectors in the digital advertisement landscape: ads in the form of promoted posts are shown in the feed of users of a social networking platform, along with normal social posts; if a user clicks on a promoted post, the host (social network owner) is paid a fixed amount from the advertiser. In this context, allocating ads to users is typically performed by maximizing click-through-rate, i.e., the likelihood that the user will click on the ad. However, this simple strategy fails to leverage the fact the ads can propagate virally through the network, from endorsing users to their followers. In this paper, we study the problem of allocating ads to users through the viral-marketing lenses. We show that allocation that takes into account the propensity of ads for viral propagation can achieve significantly better performance. However, uncontrolled virality could be undesirable for the host as it creates room for exploitation by the advertisers: hoping to tap uncontrolled virality, an advertiser might declare a lower budget for its marketing campaign, aiming at the same large outcome with a smaller cost. This creates a challenging trade-off: on the one hand, the host aims at leveraging virality and the network effect to improve advertising efficacy, while on the other hand the host wants to avoid giving away free service due to uncontrolled virality. We formalize this as the problem of ad allocation with minimum regret, which we show is NP-hard and inapproximable w.r.t. any factor. 
However, we devise an algorithm that provides approximation guarantees w.r.t. the total budget of all advertisers. We develop a scalable version of our approximation algorithm, which we extensively test on four real-world data sets, confirming that our algorithm delivers high quality solutions, is scalable, and significantly outperforms several natural baselines.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2015:ASD, author = "Lingyang Chu and Shuhui Wang and Siyuan Liu and Qingming Huang and Jian Pei", title = "{ALID}: scalable dominant cluster detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "826--837", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Detecting dominant clusters is important in many analytic applications. The state-of-the-art methods find dense subgraphs on the affinity graph as dominant clusters. However, the time and space complexities of those methods are dominated by the construction of affinity graph, which is quadratic with respect to the number of data points, and thus are impractical on large data sets. To tackle the challenge, in this paper, we apply Evolutionary Game Theory (EGT) and develop a scalable algorithm, Approximate Localized Infection Immunization Dynamics (ALID). The major idea is to perform Localized Infection Immunization Dynamics (LID) to find dense subgraphs within local ranges of the affinity graph. LID is further scaled up with guaranteed high efficiency and detection quality by an estimated Region of Interest (ROI) and a Candidate Infective Vertex Search method (CIVS). 
ALID only constructs small local affinity graphs and has time complexity $ O(C(a^* + \delta) n) $ and space complexity $ O(a^*(a^* + \delta)) $, where $ a^* $ is the size of the largest dominant cluster, and $ C \ll n $ and $ \delta \ll n $ are small constants. We demonstrate by extensive experiments on both synthetic data and real world data that ALID achieves the state-of-the-art detection quality with much lower time and space cost on single machine. We also demonstrate the encouraging parallelization performance of ALID by implementing the Parallel ALID (PALID) on Apache Spark. PALID processes 50 million SIFT data points in 2.29 hours, achieving a speedup ratio of 7.51 with 8 executors.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shao:2015:ESS, author = "Yingxia Shao and Bin Cui and Lei Chen and Mingming Liu and Xing Xie", title = "An efficient similarity search framework for {SimRank} over large dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "838--849", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SimRank is an important measure of vertex-pair similarity according to the structure of graphs. The similarity search based on SimRank is an important operation for identifying similar vertices in a graph and has been employed in many data analysis applications. Nowadays, graphs in the real world become much larger and more dynamic. The existing solutions for similarity search are expensive in terms of time and space cost. None of them can efficiently support similarity search over large dynamic graphs. 
In this paper, we propose a novel two-stage random-walk sampling framework (TSF) for SimRank-based similarity search (e.g., top-$k$ search). In the preprocessing stage, TSF samples a set of one-way graphs to index raw random walks in a novel manner within $ O(N R_g)$ time and space, where $N$ is the number of vertices and $ R_g$ is the number of one-way graphs. The one-way graph can be efficiently updated in accordance with the graph modification, thus TSF is well suited to dynamic graphs. During the query stage, TSF can search similar vertices fast by naturally pruning unqualified vertices based on the connectivity of one-way graphs. Furthermore, with additional $ R_q$ samples, TSF can estimate the SimRank score with probability [EQUATION] if the error of approximation is bounded by $ 1 - \epsilon $. Finally, to guarantee the scalability of TSF, the one-way graphs can also be compactly stored on the disk when the memory is limited. Extensive experiments have demonstrated that TSF can handle dynamic billion-edge graphs with high performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ahmad:2015:CMD, author = "Muhammad Yousuf Ahmad and Bettina Kemme", title = "Compaction management in distributed key--value datastores", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "850--861", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Compactions are a vital maintenance mechanism used by datastores based on the log-structured merge-tree to counter the continuous buildup of data files under update-intensive workloads. 
While compactions help keep read latencies in check over the long run, this comes at the cost of significantly degraded read performance over the course of the compaction itself. In this paper, we offer an in-depth analysis of compaction-related performance overheads and propose techniques for their mitigation. We offload large, expensive compactions to a dedicated compaction server to allow the datastore server to better utilize its resources towards serving the actual workload. Moreover, since the newly compacted data is already cached in the compaction server's main memory, we fetch this data over the network directly into the datastore server's local cache, thereby avoiding the performance penalty of reading it back from the filesystem. In fact, pre-fetching the compacted data from the remote cache prior to switching the workload over to it can eliminate local cache misses altogether. Therefore, we implement a smarter warmup algorithm that ensures that all incoming read requests are served from the datastore server's local cache even as it is warming up. We have integrated our solution into HBase, and using the YCSB and TPC-C benchmarks, we show that our approach significantly mitigates compaction-related performance problems. We also demonstrate the scalability of our solution by distributing compactions across multiple compaction servers.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guerraoui:2015:DPD, author = "Rachid Guerraoui and Anne-Marie Kermarrec and Rhicheek Patra and Mahsa Taziki", title = "{D2P}: distance-based differential privacy in recommenders", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "862--873", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The upsurge in the number of web users over the last two decades has resulted in a significant growth of online information. This information growth calls for recommenders that personalize the information proposed to each individual user. Nevertheless, personalization also opens major privacy concerns. This paper presents D2P, a novel protocol that ensures a strong form of differential privacy, which we call distance-based differential privacy, and which is particularly well suited to recommenders. D2P avoids revealing exact user profiles by creating altered profiles where each item is replaced with another one at some distance. We evaluate D2P analytically and experimentally on MovieLens and Jester datasets and compare it with other private and non-private recommenders.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mitliagkas:2015:FFP, author = "Ioannis Mitliagkas and Michael Borokhovich and Alexandros G. 
Dimakis and Constantine Caramanis", title = "{FrogWild!}: fast {PageRank} approximations on graph engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "874--885", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose FrogWild, a novel algorithm for fast approximation of high PageRank vertices, geared towards reducing network costs of running traditional PageRank algorithms. Our algorithm can be seen as a quantized version of power iteration that performs multiple parallel random walks over a directed graph. One important innovation is that we introduce a modification to the GraphLab framework that only partially synchronizes mirror vertices. This partial synchronization vastly reduces the network traffic generated by traditional PageRank algorithms, thus greatly reducing the per-iteration cost of PageRank. On the other hand, this partial synchronization also creates dependencies between the random walks used to estimate PageRank. Our main theoretical innovation is the analysis of the correlations introduced by this partial synchronization process and a bound establishing that our approximation is close to the true PageRank vector. We implement our algorithm in GraphLab and compare it against the default PageRank implementation. We show that our algorithm is very fast, performing each iteration in less than one second on the Twitter graph and can be up to $ 7 \times $ faster compared to the standard GraphLab PageRank implementation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vattani:2015:OPC, author = "Andrea Vattani and Flavio Chierichetti and Keegan Lowenstein", title = "Optimal probabilistic cache stampede prevention", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "8", pages = "886--897", month = apr, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Apr 15 19:02:29 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "When a frequently-accessed cache item expires, multiple requests to that item can trigger a cache miss and start regenerating that same item at the same time. This phenomenon, known as cache stampede, severely limits the performance of databases and web servers. A natural countermeasure to this issue is to let the processes that perform such requests to randomly ask for a regeneration before the expiration time of the item. In this paper we give optimal algorithms for performing such probabilistic early expirations. Our algorithms are theoretically optimal and have much better performances than other solutions used in real-world applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Potti:2015:DNP, author = "Navneet Potti and Jignesh M. 
Patel", title = "{DAQ}: a new paradigm for approximate query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "898--909", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777599", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many modern applications deal with exponentially increasing data volumes and aid business-critical decisions in near real-time. Particularly in exploratory data analysis, the focus is on interactive querying and some degree of error in estimated results is tolerable. A common response to this challenge is approximate query processing, where the user is presented with a quick confidence interval estimate based on a sample of the data. In this work, we highlight some of the problems that are associated with this probabilistic approach when extended to more complex queries, both in semantic interpretation and the lack of a formal algebra. As an alternative, we propose deterministic approximate querying (DAQ) schemes, formalize a closed deterministic approximation algebra, and outline some design principles for DAQ schemes. We also illustrate the utility of this approach with an example deterministic online approximation scheme which uses a bitsliced index representation and computes the most significant bits of the result first. Our prototype scheme delivers speedups over exact aggregation and predicate evaluation, and outperforms sampling-based schemes for extreme value aggregations.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Anciaux:2015:SSE, author = "Nicolas Anciaux and Saliha Lallali and Iulian Sandu Popa and Philippe Pucheral", title = "A scalable search engine for mass storage smart objects", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "910--921", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777600", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a new embedded search engine designed for smart objects. Such devices are generally equipped with extremely low RAM and large Flash storage capacity. To tackle these conflicting hardware constraints, conventional search engines privilege either insertion or query scalability but cannot meet both requirements at the same time. Moreover, very few solutions support document deletions and updates in this context. In this paper, we introduce three design principles, namely Write-Once Partitioning, Linear Pipelining and Background Linear Merging, and show how they can be combined to produce an embedded search engine reconciling high insert\slash delete\slash update rate and query scalability. We have implemented our search engine on a development board having a hardware configuration representative for smart objects and have conducted extensive experiments using two representative datasets. The experimental results demonstrate the scalability of the approach and its superiority compared to state of the art methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", }

%%% NOTE(review): Wang:2015:SMD --- the final author is recorded here as
%%% "Chen Wang".  The harvested publisher metadata had a trailing "z" on
%%% the surname, presumably a garbled affiliation footnote mark; TODO
%%% confirm against the DOI landing page 10.14778/2777598.2777601.

@Article{Wang:2015:SMD,
  author =       "Lanjun Wang and Shuo Zhang and Juwei Shi and Limei Jiao and
                 Oktie Hassanzadeh and Jia Zou and Chen Wang",
  title =        "Schema management for document stores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "8",
  number =       "9",
  pages =        "922--933",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/2777598.2777601",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri May 15 17:15:24 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Document stores that provide the efficiency of a
                 schema-less interface are widely used by developers in
                 mobile and cloud applications. However, the simplicity
                 developers achieved controversially leads to complexity
                 for data management due to lack of a schema. In this
                 paper, we present a schema management framework for
                 document stores. This framework discovers and persists
                 schemas of JSON records in a repository, and also
                 supports queries and schema summarization. The major
                 technical challenge comes from varied structures of
                 records caused by the schema-less data model and schema
                 evolution. In the discovery phase, we apply a canonical
                 form based method and propose an algorithm based on
                 equivalent sub-trees to group equivalent schemas
                 efficiently. Together with the algorithm, we propose a
                 new data structure, eSiBu-Tree, to store schemas and
                 support queries. In order to present a single summarized
                 representation for heterogeneous schemas in records, we
                 introduce the concept of ``skeleton'', and propose to
                 use it as a relaxed form of the schema, which captures a
                 small set of core attributes. Finally, extensive
                 experiments based on real data sets demonstrate the
                 efficiency of our proposed schema discovery algorithms,
                 and practical use cases in real-world data exploration
                 and integration scenarios are presented to illustrate
                 the effectiveness of using skeletons in these
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Schuhknecht:2015:SDS, author = "Felix Martin Schuhknecht and Pankaj Khanchandani and Jens Dittrich", title = "On the surprising difficulty of simple things: the case of radix partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "934--937", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Partitioning a dataset into ranges is a task that is common in various applications such as sorting [1,6,7,8,9] and hashing [3] which are in turn building blocks for almost any type of query processing. Especially radix-based partitioning is very popular due to its simplicity and high performance over comparison-based versions [6].", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2015:KBT, author = "Xin Luna Dong and Evgeniy Gabrilovich and Kevin Murphy and Van Dang and Wilko Horn and Camillo Lugaresi and Shaohua Sun and Wei Zhang", title = "Knowledge-based trust: estimating the trustworthiness of web sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "938--949", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The quality of web sources has been traditionally evaluated using exogenous signals such as the hyperlink structure of the graph. We propose a new approach that relies on endogenous signals, namely, the correctness of factual information provided by the source. A source that has few false facts is considered to be trustworthy. The facts are automatically extracted from each source by information extraction methods commonly used to construct knowledge bases. We propose a way to distinguish errors made in the extraction process from factual errors in the web source per se, by using joint inference in a novel multi-layer probabilistic model. We call the trustworthiness score we computed Knowledge-Based Trust (KBT). On synthetic data, we show that our method can reliably compute the true trustworthiness levels of the sources. We then apply it to a database of 2.8B facts extracted from the web, and thereby estimate the trustworthiness of 119M webpages. Manual evaluation of a subset of the results confirms the effectiveness of the method.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Han:2015:GUB, author = "Minyang Han and Khuzaima Daudjee", title = "{Giraph} unchained: barrierless asynchronous parallel execution in {Pregel}-like graph processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "950--961", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The bulk synchronous parallel (BSP) model used by synchronous graph processing systems allows algorithms to be easily implemented and reasoned about. However, BSP can suffer from poor performance due to stale messages and frequent global synchronization barriers. Asynchronous computation models have been proposed to alleviate these overheads but existing asynchronous systems that implement such models have limited scalability or retain frequent global barriers, and do not always support graph mutations or algorithms with multiple computation phases. We propose barrierless asynchronous parallel (BAP), a new computation model that reduces both message staleness and global synchronization. This enables BAP to overcome the limitations of existing asynchronous models while retaining support for graph mutations and algorithms with multiple computation phases. We present GiraphUC, which implements our BAP model in the open source distributed graph processing system Giraph, and evaluate our system at scale with large real-world graphs on 64 EC2 machines. We show that GiraphUC provides across-the-board performance improvements of up to $ 5 \times $ faster over synchronous systems and up to an order of magnitude faster than asynchronous systems. 
Our results demonstrate that the BAP model provides efficient and transparent asynchronous execution of algorithms that are programmed synchronously.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bogh:2015:WEP, author = "Kenneth S. B{\o}gh and Sean Chester and Ira Assent", title = "Work-efficient parallel skyline computation for the {GPU}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "9", pages = "962--973", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2777598.2777605", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 15 17:15:24 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The skyline operator returns records in a dataset that provide optimal trade-offs of multiple dimensions. State-of-the-art skyline computation involves complex tree traversals, data-ordering, and conditional branching to minimize the number of point-to-point comparisons. Meanwhile, GPGPU computing offers the potential for parallelizing skyline computation across thousands of cores. However, attempts to port skyline algorithms to the GPU have prioritized throughput and failed to outperform sequential algorithms. In this paper, we introduce a new skyline algorithm, designed for the GPU, that uses a global, static partitioning scheme. With the partitioning, we can permit controlled branching to exploit transitive relationships and avoid most point-to-point comparisons. The result is a non-traditional GPU algorithm, SkyAlign, that prioritizes work-efficiency and respectable throughput, rather than maximal throughput, to achieve orders of magnitude faster performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lai:2015:SSE, author = "Longbin Lai and Lu Qin and Xuemin Lin and Lijun Chang", title = "Scalable subgraph enumeration in {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "974--985", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794368", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph enumeration, which aims to find all the subgraphs of a large data graph that are isomorphic to a given pattern graph, is a fundamental graph problem with a wide range of applications. However, existing sequential algorithms for subgraph enumeration fall short in handling large graphs due to the involvement of computationally intensive subgraph isomorphism operations. Thus, some recent researches focus on solving the problem using MapReduce. Nevertheless, existing MapReduce approaches are not scalable to handle very large graphs since they either produce a huge number of partial results or consume a large amount of memory. Motivated by this, in this paper, we propose a new algorithm TwinTwigJoin based on a left-deep-join framework in MapReduce, in which the basic join unit is a TwinTwig (an edge or two incident edges of a node). We show that in the Erd{\H{o}}s--R{\'e}nyi random-graph model, TwinTwigJoin is instance optimal in the left-deep-join framework under reasonable assumptions, and we devise an algorithm to compute the optimal join plan. Three optimization strategies are explored to improve our algorithm. Furthermore, we discuss how our approach can be adapted in the power-law random-graph model. We conduct extensive performance studies in several real graphs, one of which contains billions of edges. 
Our approach significantly outperforms existing solutions in all tests.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Finis:2015:IHD, author = "Jan Finis and Robert Brunel and Alfons Kemper and Thomas Neumann and Norman May and Franz Faerber", title = "Indexing highly dynamic hierarchical data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "986--997", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794369", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Maintaining and querying hierarchical data in a relational database system is an important task in many business applications. This task is especially challenging when considering dynamic use cases with a high rate of complex, possibly skewed structural updates. Labeling schemes are widely considered the indexing technique of choice for hierarchical data, and many different schemes have been proposed. However, they cannot handle dynamic use cases well due to various problems which we investigate in this paper. We therefore propose our dynamic Order Indexes, which offer competitive query performance, unprecedented update efficiency, and robustness for highly dynamic workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2015:CDS, author = "Meng Wang and Chaokun Wang and Jeffrey Xu Yu and Jun Zhang", title = "Community detection in social networks: an in-depth benchmarking study with a procedure-oriented framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "998--1009", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794370", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Revealing the latent community structure, which is crucial to understanding the features of networks, is an important problem in network and graph analysis. During the last decade, many approaches have been proposed to solve this challenging problem in diverse ways, i.e. different measures or data structures. Unfortunately, experimental reports on existing techniques fell short in validity and integrity since many comparisons were not based on a unified code base or merely discussed in theory. We engage in an in-depth benchmarking study of community detection in social networks. We formulate a generalized community detection procedure and propose a procedure-oriented framework for benchmarking. This framework enables us to evaluate and compare various approaches to community detection systematically and thoroughly under identical experimental conditions. Upon that we can analyze and diagnose the inherent defect of existing approaches deeply, and further make effective improvements correspondingly. We have re-implemented ten state-of-the-art representative algorithms upon this framework and make comprehensive evaluations of multiple aspects, including the efficiency evaluation, performance evaluations, sensitivity evaluations, etc. 
We discuss their merits and faults in depth, and draw a set of take-away interesting conclusions. In addition, we present how we can make diagnoses for these algorithms resulting in significant improvements.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kazemi:2015:GGM, author = "Ehsan Kazemi and S. Hamed Hassani and Matthias Grossglauser", title = "Growing a graph matching from a handful of seeds", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1010--1021", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794371", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many graph-mining problems, two networks from different domains have to be matched. In the absence of reliable node attributes, graph matching has to rely on only the link structures of the two networks, which amounts to a generalization of the classic graph isomorphism problem. Graph matching has applications in social-network reconciliation and de-anonymization, protein-network alignment in biology, and computer vision. The most scalable graph-matching approaches use ideas from percolation theory, where a matched node pair ``infects'' neighbouring pairs as additional potential matches. This class of matching algorithm requires an initial seed set of known matches to start the percolation. The size and correctness of the matching is very sensitive to the size of the seed set. In this paper, we give a new graph-matching algorithm that can operate with a much smaller seed set than previous approaches, with only a small increase in matching errors.
We characterize a phase transition in matching performance as a function of the seed set size, using a random bigraph model and ideas from bootstrap percolation theory. We also show the excellent performance in matching several real large-scale social networks, using only a handful of seeds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cheng:2015:RDB, author = "Peng Cheng and Xiang Lian and Zhao Chen and Rui Fu and Lei Chen and Jinsong Han and Jizhong Zhao", title = "Reliable diversity-based spatial crowdsourcing by moving workers", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1022--1033", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794372", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the rapid development of mobile devices and the crowdsourcing platforms, the spatial crowdsourcing has attracted much attention from the database community, specifically, spatial crowdsourcing refers to sending a location-based request to workers according to their positions. In this paper, we consider an important spatial crowdsourcing problem, namely reliable diversity-based spatial crowdsourcing (RDB-SC), in which spatial tasks (such as taking videos/photos of a landmark or firework shows, and checking whether or not parking spaces are available) are time-constrained, and workers are moving towards some directions. Our RDB-SC problem is to assign workers to spatial tasks such that the completion reliability and the spatial/temporal diversities of spatial tasks are maximized. We prove that the RDB-SC problem is NP-hard and intractable. 
Thus, we propose three effective approximation approaches, including greedy, sampling, and divide-and-conquer algorithms. In order to improve the efficiency, we also design an effective cost-model-based index, which can dynamically maintain moving workers and spatial tasks with low cost, and efficiently facilitate the retrieval of RDB-SC answers. Through extensive experiments, we demonstrate the efficiency and effectiveness of our proposed approaches over both real and synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2015:LHF, author = "Zhuojie Zhou and Nan Zhang and Gautam Das", title = "Leveraging history for faster sampling of online social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1034--1045", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794373", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With a vast amount of data available on online social networks, how to enable efficient analytics over such data has been an increasingly important research problem. Given the sheer size of such social networks, many existing studies resort to sampling techniques that draw random nodes from an online social network through its restrictive web/API interface. While these studies differ widely in analytics tasks supported and algorithmic design, almost all of them use the exact same underlying technique of random walk --- a Markov Chain Monte Carlo based method which iteratively transits from one node to its random neighbor. 
Random walk fits naturally with this problem because, for most online social networks, the only query we can issue through the interface is to retrieve the neighbors of a given node (i.e., no access to the full graph topology). A problem with random walks, however, is the ``burn-in'' period which requires a large number of transitions/queries before the sampling distribution converges to a stationary value that enables the drawing of samples in a statistically valid manner. In this paper, we consider a novel problem of speeding up the fundamental design of random walks (i.e., reducing the number of queries it requires) without changing the stationary distribution it achieves --- thereby enabling a more efficient ``drop-in'' replacement for existing sampling-based analytics techniques over online social networks. Technically, our main idea is to leverage the history of random walks to construct a higher-ordered Markov chain. We develop two algorithms, Circulated Neighbors and Groupby Neighbors Random Walk (CNRW and GNRW) and rigidly prove that, no matter what the social network topology is, CNRW and GNRW offer better efficiency than baseline random walks while achieving the same stationary distribution. We demonstrate through extensive experiments on real-world social networks and synthetic graphs the superiority of our techniques over the existing ones.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2015:TFE, author = "Yufei Ding and Xipeng Shen and Madanlal Musuvathi and Todd Mytkowicz", title = "{TOP}: a framework for enabling algorithmic optimizations for distance-related problems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1046--1057", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794374", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computing distances among data points is an essential part of many important algorithms in data analytics, graph analysis, and other domains. In each of these domains, developers have spent significant manual effort optimizing algorithms, often through novel applications of the triangle inequality, in order to minimize the number of distance computations in the algorithms. In this work, we observe that many algorithms across these domains can be generalized as an instance of a generic distance-related abstraction. Based on this abstraction, we derive seven principles for correctly applying the triangular inequality to optimize distance-related algorithms. Guided by the findings, we develop {Triangular} {OPtimizer} (TOP), the first software framework that is able to automatically produce optimized algorithms that either match or outperform manually designed algorithms for solving distance-related problems. TOP achieves up to 237x speedups and 2.5X on average.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Leis:2015:EPW, author = "Viktor Leis and Kan Kundhikanjana and Alfons Kemper and Thomas Neumann", title = "Efficient processing of window functions in analytical {SQL} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1058--1069", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794375", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Window functions, also known as analytic OLAP functions, have been part of the SQL standard for more than a decade and are now a widely-used feature. Window functions allow to elegantly express many useful query types including time series analysis, ranking, percentiles, moving averages, and cumulative sums. Formulating such queries in plain SQL-92 is usually both cumbersome and inefficient. Despite being supported by all major database systems, there have been few publications that describe how to implement an efficient relational window operator. This work aims at filling this gap by presenting an efficient and general algorithm for the window operator. Our algorithm is optimized for high-performance main-memory database systems and has excellent performance on modern multi-core CPUs. We show how to fully parallelize all phases of the operator in order to effectively scale for arbitrary input distributions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:RTT, author = "Yuchen Li and Dongxiang Zhang and Kian-Lee Tan", title = "Real-time targeted influence maximization for online advertisements", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1070--1081", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794376", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Advertising in social network has become a multi-billion-dollar industry. A main challenge is to identify key influencers who can effectively contribute to the dissemination of information. Although the influence maximization problem, which finds a seed set of k most influential users based on certain propagation models, has been well studied, it is not target-aware and cannot be directly applied to online advertising. In this paper, we propose a new problem, named Keyword-Based Targeted Influence Maximization (KB-TIM), to find a seed set that maximizes the expected influence over users who are relevant to a given advertisement. To solve the problem, we propose a sampling technique based on weighted reverse influence set and achieve an approximation ratio of $ (1 - 1 / e - \epsilon) $. To meet the instant-speed requirement, we propose two disk-based solutions that improve the query processing time by two orders of magnitude over the state-of-the-art solutions, while keeping the theoretical bound. Experiments conducted on two real social networks confirm our theoretical findings as well as the efficiency. Given an advertisement with 5 keywords, it takes only 2 seconds to find the most influential users in a social network with billions of edges.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papenbrock:2015:FDD, author = "Thorsten Papenbrock and Jens Ehrlich and Jannik Marten and Tommy Neubert and Jan-Peer Rudolph and Martin Sch{\"o}nberg and Jakob Zwiener and Felix Naumann", title = "Functional dependency discovery: an experimental evaluation of seven algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1082--1093", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794377", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Functional dependencies are important metadata used for schema normalization, data cleansing and many other tasks. The efficient discovery of functional dependencies in tables is a well-known challenge in database research and has seen several approaches. Because no comprehensive comparison between these algorithms exist at the time, it is hard to choose the best algorithm for a given dataset. In this experimental paper, we describe, evaluate, and compare the seven most cited and most important algorithms, all solving this same problem. First, we classify the algorithms into three different categories, explaining their commonalities. We then describe all algorithms with their main ideas. The descriptions provide additional details where the original papers were ambiguous or incomplete. Our evaluation of careful re-implementations of all algorithms spans a broad test space including synthetic and real-world data. We show that all functional dependency algorithms optimize for certain data characteristics and provide hints on when to choose which algorithm. 
In summary, however, all current approaches scale surprisingly poorly, showing potential for future research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kalinin:2015:SEI, author = "Alexander Kalinin and Ugur Cetintemel and Stan Zdonik", title = "{Searchlight}: enabling integrated search and exploration over large multidimensional data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1094--1105", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794378", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a new system, called Searchlight, that uniquely integrates constraint solving and data management techniques. It allows Constraint Programming (CP) machinery to run efficiently inside a DBMS without the need to extract, transform and move the data. This marriage concurrently offers the rich expressiveness and efficiency of constraint-based search and optimization provided by modern CP solvers, and the ability of DBMSs to store and query data at scale, resulting in an enriched functionality that can effectively support both data- and search-intensive applications. As such, Searchlight is the first system to support generic search, exploration and mining over large multi-dimensional data collections, going beyond point algorithms designed for point search and mining tasks. 
Searchlight makes the following scientific contributions: (1) Constraint solvers as first-class citizens: Instead of treating solver logic as a black-box, Searchlight provides native support, incorporating the necessary APIs for its specification and transparent execution as part of query plans, as well as novel algorithms for its optimized execution and parallelization. (2) Speculative solving: Existing solvers assume that the entire data set is main-memory resident. Searchlight uses an innovative two stage Solve-Validate approach that allows it to operate speculatively yet safely on main-memory synopses, quickly producing candidate search results that can later be efficiently validated on real data. (3) Computation and I/O load balancing: As CP solver logic can be computationally expensive, executing it on large search and data spaces requires novel CPU-I/O balancing approaches when performing search distribution. We built a prototype implementation of Searchlight on Google's Or-Tools, an open-source suite of operations research tools, and the array DBMS SciDB. Extensive experimental results show that Searchlight often performs orders of magnitude faster than the next best approach (SciDB-only or CP-solver-only) in terms of end response time and time to first result.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rahman:2015:PID, author = "Md Farhadur Rahman and Weimo Liu and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Privacy implications of database ranking", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "10", pages = "1106--1117", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2794367.2794379", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:06 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, there has been much research in the adoption of Ranked Retrieval model (in addition to the Boolean retrieval model) in structured databases, especially those in a client-server environment (e.g., web databases). With this model, a search query returns top-$k$ tuples according to not just exact matches of selection conditions, but a suitable ranking function. While much research has gone into the design of ranking functions and the efficient processing of top-$k$ queries, this paper studies a novel problem on the privacy implications of database ranking. The motivation is a novel yet serious privacy leakage we found on real-world web databases which is caused by the ranking function design. Many such databases feature private attributes --- e.g., a social network allows users to specify certain attributes as only visible to him/herself, but not to others. While these websites generally respect the privacy settings by not directly displaying private attribute values in search query answers, many of them nevertheless take into account such private attributes in the ranking function design. The conventional belief might be that tuple ranks alone are not enough to reveal the private attribute values. Our investigation, however, shows that this is not the case in reality.
To address the problem, we introduce a taxonomy of the problem space with two dimensions, (1) the type of query interface and (2) the capability of adversaries. For each subspace, we develop a novel technique which either guarantees the successful inference of private attributes, or does so for a significant portion of real-world tuples. We demonstrate the effectiveness and efficiency of our techniques through theoretical analysis, extensive experiments over real-world datasets, as well as successful online attacks over websites with tens to hundreds of millions of users --- e.g., Amazon Goodreads and Renren.com.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kohler:2015:PCS, author = "Henning K{\"o}hler and Sebastian Link and Xiaofang Zhou", title = "Possible and certain {SQL} keys", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1118--1129", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809975", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Driven by the dominance of the relational model, the requirements of modern applications, and the veracity of data, we revisit the fundamental notion of a key in relational databases with NULLs. In SQL database systems primary key columns are NOT NULL by default. NULL columns may occur in unique constraints which only guarantee uniqueness for tuples which do not feature null markers in any of the columns involved, and therefore serve a different function than primary keys. We investigate the notions of possible and certain keys, which are keys that hold in some or all possible worlds that can originate from an SQL table, respectively. 
Possible keys coincide with the unique constraint of SQL, and thus provide a semantics for their syntactic definition in the SQL standard. Certain keys extend primary keys to include NULL columns, and thus form a sufficient and necessary condition to identify tuples uniquely, while primary keys are only sufficient for that purpose. In addition to basic characterization, axiomatization, and simple discovery approaches for possible and certain keys, we investigate the existence and construction of Armstrong tables, and describe an indexing scheme for enforcing certain keys. Our experiments show that certain keys with NULLs do occur in real-world databases, and that related computational problems can be solved efficiently. Certain keys are therefore semantically well-founded and able to maintain data quality in the form of Codd's entity integrity rule while handling the requirements of modern applications, that is, higher volumes of incomplete data from different formats.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tang:2015:SSJ, author = "Yu Tang and Yilun Cai and Nikos Mamoulis", title = "Scaling similarity joins over tree-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1130--1141", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809976", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a large collection of tree-structured objects (e.g., XML documents), the similarity join finds the pairs of objects that are similar to each other, based on a similarity threshold and a tree edit distance measure. 
The state-of-the-art similarity join methods compare simpler approximations of the objects (e.g., strings), in order to prune pairs that cannot be part of the similarity join result based on distance bounds derived by the approximations. In this paper, we propose a novel similarity join approach, which is based on the dynamic decomposition of the tree objects into subgraphs, according to the similarity threshold. Our technique avoids computing the exact distance between two tree objects, if the objects do not share at least one common subgraph. In order to scale up the join, the computed subgraphs are managed in a two-layer index. Our experimental results on real and synthetic data collections show that our approach outperforms the state-of-the-art methods by up to an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rahman:2015:WSE, author = "Habibur Rahman and Saravanan Thirumuruganathan and Senjuti Basu Roy and Sihem Amer-Yahia and Gautam Das", title = "Worker skill estimation in team-based tasks", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1142--1153", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809977", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many emerging applications such as collaborative editing, multi-player games, or fan-subbing require to form a team of experts to accomplish a task together. Existing research has investigated how to assign workers to such team-based tasks to ensure the best outcome assuming the skills of individual workers to be known. In this work, we investigate how to estimate individual worker's skill based on the outcome of the team-based tasks they have undertaken. 
We consider two popular skill aggregation functions and estimate the skill of the workers, where skill is either a deterministic value or a probability distribution. We propose efficient solutions for worker skill estimation using continuous and discrete optimization techniques. We present comprehensive experiments and validate the scalability and effectiveness of our proposed solutions using multiple real-world datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2015:DDP, author = "Xi He and Graham Cormode and Ashwin Machanavajjhala and Cecilia M. Procopiuc and Divesh Srivastava", title = "{DPT}: differentially private trajectory synthesis using hierarchical reference systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1154--1165", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809978", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "GPS-enabled devices are now ubiquitous, from airplanes and cars to smartphones and wearable technology. This has resulted in a wealth of data about the movements of individuals and populations, which can be analyzed for useful information to aid in city and traffic planning, disaster preparedness and so on. However, the places that people go can disclose extremely sensitive information about them, and thus their use needs to be filtered through privacy preserving mechanisms. This turns out to be a highly challenging task: raw trajectories are highly detailed, and typically no pair is alike. Previous attempts fail either to provide adequate privacy protection, or to remain sufficiently faithful to the original behavior. 
This paper presents DPT, a system to synthesize mobility data based on raw GPS trajectories of individuals while ensuring strong privacy protection in the form of $ \epsilon $-differential privacy. DPT makes a number of novel modeling and algorithmic contributions including (i) discretization of raw trajectories using hierarchical reference systems (at multiple resolutions) to capture individual movements at differing speeds, (ii) adaptive mechanisms to select a small set of reference systems and construct prefix tree counts privately, and (iii) use of direction-weighted sampling for improved utility. While there have been prior attempts to solve the subproblems required to generate synthetic trajectories, to the best of our knowledge, ours is the first system that provides an end-to-end solution. We show the efficacy of our synthetic trajectory generation system using an extensive empirical evaluation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:SSA, author = "Boduo Li and Yanlei Diao and Prashant Shenoy", title = "Supporting scalable analytics with latency constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1166--1177", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809979", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently there has been a significant interest in building big data analytics systems that can handle both ``big data'' and ``fast data''. Our work is strongly motivated by recent real-world use cases that point to the need for a general, unified data processing framework to support analytical queries with different latency requirements. 
Toward this goal, we start with an analysis of existing big data systems to understand the causes of high latency. We then propose an extended architecture with mini-batches as granularity for computation and shuffling, and augment it with new model-driven resource allocation and runtime scheduling techniques to meet user latency requirements while maximizing throughput. Results from real-world workloads show that our techniques, implemented in Incremental Hadoop, reduce its latency from tens of seconds to sub-second, with 2x-5x increase in throughput. Our system also outperforms state-of-the-art distributed stream systems, Storm and Spark Streaming, by 1-2 orders of magnitude when combining latency and throughput.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shiokawa:2015:SEA, author = "Hiroaki Shiokawa and Yasuhiro Fujiwara and Makoto Onizuka", title = "{SCAN++}: efficient algorithm for finding clusters, hubs and outliers on large-scale graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1178--1189", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809980", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph clustering is one of the key techniques for understanding the structures present in graphs. Besides cluster detection, identifying hubs and outliers is also a key task, since they have important roles to play in graph data mining. The structural clustering algorithm SCAN, proposed by Xu et al., is successfully used in many application because it not only detects densely connected nodes as clusters but also identifies sparsely connected nodes as hubs or outliers. 
However, it is difficult to apply SCAN to large-scale graphs due to its high time complexity. This is because it evaluates the density for all adjacent nodes included in the given graphs. In this paper, we propose a novel graph clustering algorithm named SCAN++. In order to reduce time complexity, we introduce new data structure of directly two-hop-away reachable node set (DTAR). DTAR is the set of two-hop-away nodes from a given node that are likely to be in the same cluster as the given node. SCAN++ employs two approaches for efficient clustering by using DTARs without sacrificing clustering quality. First, it reduces the number of the density evaluations by computing the density only for the adjacent nodes such as indicated by DTARs. Second, by sharing a part of the density evaluations for DTARs, it offers efficient density evaluations of adjacent nodes. As a result, SCAN++ detects exactly the same clusters, hubs, and outliers from large-scale graphs as SCAN with much shorter computation time. Extensive experiments on both real-world and synthetic graphs demonstrate the performance superiority of SCAN++ over existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Faleiro:2015:RSM, author = "Jose M. Faleiro and Daniel J.
Abadi", title = "Rethinking serializable multiversion concurrency control", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1190--1201", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809981", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-versioned database systems have the potential to significantly increase the amount of concurrency in transaction processing because they can avoid read-write conflicts. Unfortunately, the increase in concurrency usually comes at the cost of transaction serializability. If a database user requests full serializability, modern multi-versioned systems significantly constrain read-write concurrency among conflicting transactions and employ expensive synchronization patterns in their design. In main-memory multi-core settings, these additional constraints are so burdensome that multi-versioned systems are often significantly outperformed by single-version systems. We propose Bohm, a new concurrency control protocol for main-memory multi-versioned database systems. Bohm guarantees serializable execution while ensuring that reads never block writes. In addition, Bohm does not require reads to perform any bookkeeping whatsoever, thereby avoiding the overhead of tracking reads via contended writes to shared memory. This leads to excellent scalability and performance in multi-core settings. Bohm has all the above characteristics without performing validation based concurrency control. Instead, it is pessimistic, and is therefore not prone to excessive aborts in the presence of contention. 
An experimental evaluation shows that Bohm performs well in both high contention and low contention settings, and is able to dramatically outperform state-of-the-art multi-versioned systems despite maintaining the full set of serializability guarantees.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Brancotte:2015:RAT, author = "Bryan Brancotte and Bo Yang and Guillaume Blin and Sarah Cohen-Boulakia and Alain Denise and Sylvie Hamel", title = "Rank aggregation with ties: experiments and analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1202--1213", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809982", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of aggregating multiple rankings into one consensus ranking is an active research topic especially in the database community. Various studies have implemented methods for rank aggregation and may have come up with contradicting conclusions upon which algorithms work best. Comparing such results is cumbersome, as the original studies mixed different approaches and used very different evaluation datasets and metrics. Additionally, in real applications, the rankings to be aggregated may not be permutations where elements are strictly ordered, but they may have ties where some elements are placed at the same position. However, most of the studies have not considered ties. This paper introduces the first large scale study of algorithms for rank aggregation with ties. 
More precisely, (i) we review rank aggregation algorithms and determine whether or not they can handle ties; (ii) we propose the first implementation to compute the exact solution of the Rank Aggregation with ties problem; (iii) we evaluate algorithms for rank aggregation with ties on a very large panel of both real and carefully generated synthetic datasets; (iv) we provide guidance on the algorithms to be favored depending on dataset features.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sundaram:2015:GHP, author = "Narayanan Sundaram and Nadathur Satish and Md Mostofa Ali Patwary and Subramanya R. Dulloor and Michael J. Anderson and Satya Gautam Vadlamudi and Dipankar Das and Pradeep Dubey", title = "{GraphMat}: high performance graph analytics made productive", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1214--1225", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809983", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given the growing importance of large-scale graph analytics, there is a need to improve the performance of graph analysis frameworks without compromising on productivity. GraphMat is our solution to bridge this gap between a user-friendly graph analytics framework and native, hand-optimized code. GraphMat functions by taking vertex programs and mapping them to high performance sparse matrix operations in the backend. We thus get the productivity benefits of a vertex programming framework without sacrificing performance. 
GraphMat is a single-node multicore graph framework written in C++ which has enabled us to write a diverse set of graph algorithms with the same effort compared to other vertex programming frameworks. GraphMat performs 1.1-7X faster than high performance frameworks such as GraphLab, CombBLAS and Galois. GraphMat also matches the performance of MapGraph, a GPU-based graph framework, despite running on a CPU platform with significantly lower compute and bandwidth resources. It achieves better multicore scalability (13-15X on 24 cores) than other frameworks and is 1.2X off native, hand-optimized code on a variety of graph algorithms. Since GraphMat performance depends mainly on a few scalable and well-understood sparse matrix operations, GraphMat can naturally benefit from the trend of increasing parallelism in future hardware.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2015:MKC, author = "Kai Zhang and Kaibo Wang and Yuan Yuan and Lei Guo and Rubao Lee and Xiaodong Zhang", title = "{Mega-KV}: a case for {GPUs} to maximize the throughput of in-memory key--value stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1226--1237", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809984", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory key--value stores play a critical role in data processing to provide high throughput and low latency data accesses. 
In-memory key--value stores have several unique properties that include (1) data intensive operations demanding high memory bandwidth for fast data accesses, (2) high data parallelism and simple computing operations demanding many slim parallel computing units, and (3) a large working set. As data volume continues to increase, our experiments show that conventional and general-purpose multicore systems are increasingly mismatched to the special properties of key--value stores because they do not provide massive data parallelism and high memory bandwidth; the powerful but the limited number of computing cores do not satisfy the demand of the unique data processing task; and the cache hierarchy may not well benefit to the large working set. In this paper, we make a strong case for GPUs to serve as special-purpose devices to greatly accelerate the operations of in-memory key--value stores. Specifically, we present the design and implementation of Mega-KV, a GPU-based in-memory key--value store system that achieves high performance and high throughput. Effectively utilizing the high memory bandwidth and latency hiding capability of GPUs, Mega-KV provides fast data accesses and significantly boosts overall performance. Running on a commodity PC installed with two CPUs and two GPUs, Mega-KV can process up to 160+ million key--value operations per second, which is 1.4-2.8 times as fast as the state-of-the-art key--value store system on a conventional CPU-based platform.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kim:2015:TSI, author = "Jinha Kim and Hyungyu Shin and Wook-Shin Han and Sungpack Hong and Hassan Chafi", title = "Taming subgraph isomorphism for {RDF} query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1238--1249", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809985", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RDF data are used to model knowledge in various areas such as life sciences, Semantic Web, bioinformatics, and social graphs. The size of real RDF data reaches billions of triples. This calls for a framework for efficiently processing RDF data. The core function of processing RDF data is subgraph pattern matching. There have been two completely different directions for supporting efficient subgraph pattern matching. One direction is to develop specialized RDF query processing engines exploiting the properties of RDF data for the last decade, while the other direction is to develop efficient subgraph isomorphism algorithms for general, labeled graphs for over 30 years. Although both directions have a similar goal (i.e., finding subgraphs in data graphs for a given query graph), they have been independently researched without clear reason. We argue that a subgraph isomorphism algorithm can be easily modified to handle the graph homomorphism, which is the RDF pattern matching semantics, by just removing the injectivity constraint. 
In this paper, based on the state-of-the-art subgraph isomorphism algorithm, we propose an in-memory solution, Turbo$_{\rm HOM++}$, which is tamed for the RDF processing, and we compare it with the representative RDF processing engines for several RDF benchmarks in a server machine where billions of triples can be loaded in memory. In order to speed up Turbo$_{\rm HOM++}$, we also provide a simple yet effective transformation and a series of optimization techniques. Extensive experiments using several RDF benchmarks show that Turbo$_{\rm HOM++}$ consistently and significantly outperforms the representative RDF engines. Specifically, Turbo$_{\rm HOM++}$ outperforms its competitors by up to five orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2015:SPI, author = "Lilong Jiang and Arnab Nandi", title = "{SnapToQuery}: providing interactive feedback during exploratory query specification", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1250--1261", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809986", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A critical challenge in the data exploration process is discovering and issuing the ``right'' query, especially when the space of possible queries is large. This problem of exploratory query specification is exacerbated by the use of interactive user interfaces driven by mouse, touch, or next-generation, three-dimensional, motion capture-based devices; which, are often imprecise due to jitter and sensitivity issues. 
In this paper, we propose SnapToQuery, a novel technique that guides users through the query space by providing interactive feedback during the query specification process by ``snapping'' to the user's likely intended queries. These intended queries can be derived from prior query logs, or from the data itself, using methods described in this paper. In order to provide interactive response times over large datasets, we propose two data reduction techniques when snapping to these queries. Performance experiments demonstrate that our algorithms help maintain an interactive experience while allowing for accurate guidance. User studies over three kinds of devices (mouse, touch, and motion capture) show that SnapToQuery can help users specify queries quicker and more accurately; resulting in a query specification time speedup of $ 1.4 \times $ for mouse and touch-based devices and $ 2.2 \times $ for motion capture-based devices.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2015:GFI, author = "Yang Zhou and Ling Liu and Kisung Lee and Qi Zhang", title = "{GraphTwist}: fast iterative graph computation with two-tier optimizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1262--1273", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809987", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale real-world graphs are known to have highly skewed vertex degree distribution and highly skewed edge weight distribution. 
Existing vertex-centric iterative graph computation models suffer from a number of serious problems: (1) poor performance of parallel execution due to inherent workload imbalance at vertex level; (2) inefficient CPU resource utilization due to short execution time for low-degree vertices compared to the cost of in-memory or on-disk vertex access; and (3) incapability of pruning insignificant vertices or edges to improve the computational performance. In this paper, we address the above technical challenges by designing and implementing a scalable, efficient, and provably correct two-tier graph parallel processing system, GraphTwist. At storage and access tier, GraphTwist maximizes parallel efficiency by employing three graph parallel abstractions for partitioning a big graph by slice, strip or dice based partitioning techniques. At computation tier, GraphTwist presents two utility-aware pruning strategies: slice pruning and cut pruning, to further improve the computational performance while preserving the computational utility defined by graph applications. Theoretic analysis is provided to quantitatively prove that iterative graph computations powered by utility-aware pruning techniques can achieve a very good approximation with bounds on the introduced error.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Inoue:2015:SCF, author = "Hiroshi Inoue and Kenjiro Taura", title = "{SIMD}- and cache-friendly algorithm for sorting an array of structures", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1274--1285", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809988", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper describes our new algorithm for sorting an array of structures by efficiently exploiting the SIMD instructions and cache memory of today's processors. Recently, multiway mergesort implemented with SIMD instructions has been used as a high-performance in-memory sorting algorithm for sorting integer values. For sorting an array of structures with SIMD instructions, a frequently used approach is to first pack the key and index for each record into an integer value, sort the key-index pairs using SIMD instructions, then rearrange the records based on the sorted key-index pairs. This approach can efficiently exploit SIMD instructions because it sorts the key-index pairs while packed into integer values; hence, it can use existing high-performance sorting implementations of the SIMD-based multiway mergesort for integers. However, this approach has frequent cache misses in the final rearranging phase due to its random and scattered memory accesses so that this phase limits both single-thread performance and scalability with multiple cores. Our approach is also based on multiway mergesort, but it can avoid costly random accesses for rearranging the records while still efficiently exploiting the SIMD instructions. 
Our results showed that our approach exhibited up to 2.1x better single-thread performance than the key-index approach implemented with SIMD instructions when sorting 512M 16-byte records on one core. Our approach also yielded better performance when we used multiple cores. Compared to an optimized radix sort, our vectorized multiway mergesort achieved better performance when the each record is large. Our vectorized multiway mergesort also yielded higher scalability with multiple cores than the radix sort.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2015:EDI, author = "Shaoxu Song and Aoqian Zhang and Lei Chen and Jianmin Wang", title = "Enriching data imputation with extensive similarity neighbors", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1286--1297", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809989", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Incomplete information often occur along with many database applications, e.g., in data integration, data cleaning or data exchange. The idea of data imputation is to fill the missing data with the values of its neighbors who share the same information. Such neighbors could either be identified certainly by editing rules or statistically by relational dependency networks. Unfortunately, owing to data sparsity, the number of neighbors (identified w.r.t. value equality) is rather limited, especially in the presence of data values with variances. In this paper, we argue to extensively enrich similarity neighbors by similarity rules with tolerance to small variations. More fillings can thus be acquired that the aforesaid equality neighbors fail to reveal. 
To fill the missing values more, we study the problem of maximizing the missing data imputation. Our major contributions include (1) the NP-hardness analysis on solving and approximating the problem, (2) exact algorithms for tackling the problem, and (3) efficient approximation with performance guarantees. Experiments on real and synthetic data sets demonstrate that the filling accuracy can be improved.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Makreshanski:2015:LSE, author = "Darko Makreshanski and Justin Levandoski and Ryan Stutsman", title = "To lock, swap, or elide: on the interplay of hardware transactional memory and lock-free indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1298--1309", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809990", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The release of hardware transactional memory (HTM) in commodity CPUs has major implications on the design and implementation of main-memory databases, especially on the architecture of high-performance lock-free indexing methods at the core of several of these systems. This paper studies the interplay of HTM and lock-free indexing methods. First, we evaluate whether HTM will obviate the need for crafty lock-free index designs by integrating it in a traditional B-tree architecture. HTM performs well for simple data sets with small fixed-length keys and payloads, but its benefits disappear for more complex scenarios (e.g., larger variable-length keys and payloads), making it unattractive as a general solution for achieving high performance. Second, we explore fundamental differences between HTM-based and lock-free B-tree designs. 
While lock-freedom entails design complexity and extra mechanism, it has performance advantages in several scenarios, especially high-contention cases where readers proceed uncontested (whereas HTM aborts readers). Finally, we explore the use of HTM as a method to simplify lock-free design. We find that using HTM to implement a multi-word compare-and-swap greatly reduces lock-free programming complexity at the cost of only a 10--15\% performance degradation. Our study uses two state-of-the-art index implementations: a memory-optimized B-tree extended with HTM to provide multi-threaded concurrency and the Bw-tree lock-free B-tree used in several Microsoft production environments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shin:2015:IKB, author = "Jaeho Shin and Sen Wu and Feiran Wang and Christopher {De Sa} and Ce Zhang and Christopher R{\'e}", title = "Incremental knowledge base construction using {DeepDive}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1310--1321", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809991", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Populating a database with unstructured information is a long-standing problem in industry and research that encompasses problems of extraction, cleaning, and integration. Recent names used for this problem include dealing with dark data and knowledge base construction (KBC). In this work, we describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems, and we present techniques to make the KBC process more efficient. 
We observe that the KBC process is iterative, and we develop techniques to incrementally produce inference results for KBC systems. We propose two methods for incremental inference, based respectively on sampling and variational techniques. We also study the tradeoff space of these methods and develop a simple rule-based optimizer. DeepDive includes all of these contributions, and we evaluate DeepDive on five KBC systems, showing that it can speed up KBC inference tasks by up to two orders of magnitude with negligible impact on quality.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qian:2015:LUP, author = "Li Qian and Jinyang Gao and H. V. Jagadish", title = "Learning user preferences by adaptive pairwise comparison", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1322--1333", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809992", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Users make choices among multi-attribute objects in a data set in a variety of domains including used car purchase, job search and hotel room booking. Individual users sometimes have strong preferences between objects, but these preferences may not be universally shared by all users. If we can cast these preferences as derived from a quantitative user-specific preference function, then we can predict user preferences by learning their preference function, even though the preference function itself is not directly observable, and may be hard to express. In this paper we study the problem of preference learning with pairwise comparisons on a set of entities with multiple attributes. 
We formalize the problem into two subproblems, namely preference estimation and comparison selection. We propose an innovative approach to estimate the preference, and introduce a binary search strategy to adaptively select the comparisons. We introduce the concept of an orthogonal query to support this adaptive selection, as well as a novel S-tree index to enable efficient evaluation of orthogonal queries. We integrate these components into a system for inferring user preference with adaptive pairwise comparisons. Our experiments and user study demonstrate that our adaptive system significantly outperforms the na{\"\i}ve random selection system on both real data and synthetic data, with either simulated or real user feedback. We also show our preference learning approach is much more effective than existing approaches, and our S-tree can be constructed efficiently and perform orthogonal query at interactive speeds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2015:AEL, author = "Weimo Liu and Md Farhadur Rahman and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Aggregate estimations over location based services", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1334--1345", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824034", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Location based services (LBS) have become very popular in recent years. They range from map services (e.g., Google Maps) that store geographic locations of points of interests, to online social networks (e.g., WeChat, Sina Weibo, FourSquare) that leverage user geographic locations to enable various recommendation functions. 
The public query interfaces of these services may be abstractly modeled as a $k$NN interface over a database of two dimensional points on a plane: given an arbitrary query point, the system returns the $k$ points in the database that are nearest to the query point. In this paper we consider the problem of obtaining approximate estimates of SUM and COUNT aggregates by only querying such databases via their restrictive public interfaces. We distinguish between interfaces that return location information of the returned tuples (e.g., Google Maps), and interfaces that do not return location information (e.g., Sina Weibo). For both types of interfaces, we develop aggregate estimation algorithms that are based on novel techniques for precisely computing or approximately estimating the Voronoi cell of tuples. We discuss a comprehensive set of real-world experiments for testing our algorithms, including experiments on Google Maps, WeChat, and Sina Weibo.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bhattacherjee:2015:PDV, author = "Souvik Bhattacherjee and Amit Chavan and Silu Huang and Amol Deshpande and Aditya Parameswaran", title = "Principles of dataset versioning: exploring the recreation\slash storage tradeoff", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1346--1357", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824035", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The relative ease of collaborative data science and analysis has led to a proliferation of many thousands or millions of versions of the same datasets in many scientific and commercial domains, acquired or constructed at various stages of data analysis across many users, and often over long periods of time. Managing, storing, and recreating these dataset versions is a non-trivial task. The fundamental challenge here is the storage-recreation trade-off: the more storage we use, the faster it is to recreate or retrieve versions, while the less storage we use, the slower it is to recreate or retrieve versions. Despite the fundamental nature of this problem, there has been a surprisingly little amount of work on it. In this paper, we study this trade-off in a principled manner: we formulate six problems under various settings, trading off these quantities in various ways, demonstrate that most of the problems are intractable, and propose a suite of inexpensive heuristics drawing from techniques in delay-constrained scheduling, and spanning tree literature, to solve these problems. We have built a prototype version management system, that aims to serve as a foundation to our DataHub system for facilitating collaborative data science. 
We demonstrate, via extensive experiments, that our proposed heuristics provide efficient solutions in practical dataset versioning scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2015:SJJ, author = "Yeye He and Kris Ganjam and Xu Chu", title = "{SEMA-JOIN}: joining semantically-related tables using big table corpora", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1358--1369", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824036", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Join is a powerful operator that combines records from two or more tables, which is of fundamental importance in the field of relational database. However, traditional join processing mostly relies on string equality comparisons. Given the growing demand for ad-hoc data analysis, we have seen an increasing number of scenarios where the desired join relationship is not equi-join. For example, in a spreadsheet environment, a user may want to join one table with a subject column country-name, with another table with a subject column country-code. Traditional equi-join cannot handle such joins automatically, and the user typically has to manually find an intermediate mapping table in order to perform the desired join. We develop a SEMA-JOIN approach that is a first step toward allowing users to perform semantic join automatically, with a click of the button. Our main idea is to utilize a data-driven method that leverages a big table corpus with over 100 million tables to determine statistical correlation between cell values at both row-level and column-level. 
We use the intuition that the correct join mapping is the one that maximizes aggregate pairwise correlation, to formulate the join prediction problem as an optimization problem. We develop a linear program relaxation and a rounding argument to obtain a 2-approximation algorithm in polynomial time. Our evaluation using both public tables from the Web and proprietary Enterprise tables from a large company shows that the proposed approach can perform automatic semantic joins with high precision for a variety of common join scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Krishnan:2015:SVC, author = "Sanjay Krishnan and Jiannan Wang and Michael J. Franklin and Ken Goldberg and Tim Kraska", title = "Stale view cleaning: getting fresh answers from stale materialized views", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1370--1381", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824037", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Materialized views (MVs), stored pre-computed results, are widely used to facilitate fast queries on large datasets. When new records arrive at a high rate, it is infeasible to continuously update (maintain) MVs and a common solution is to defer maintenance by batching updates together. Between batches the MVs become increasingly stale with incorrect, missing, and superfluous rows leading to increasingly inaccurate query results. We propose Stale View Cleaning (SVC) which addresses this problem from a data cleaning perspective. In SVC, we efficiently clean a sample of rows from a stale MV, and use the clean sample to estimate aggregate query results. 
While approximate, the estimated query results reflect the most recent data. As sampling can be sensitive to long-tailed distributions, we further explore an outlier indexing technique to give increased accuracy when the data distributions are skewed. SVC complements existing deferred maintenance approaches by giving accurate and bounded query answers between maintenance. We evaluate our method on a generated dataset from the TPC-D benchmark and a real video distribution application. Experiments confirm our theoretical results: (1) cleaning an MV sample is more efficient than full view maintenance, (2) the estimated results are more accurate than using the stale MV, and (3) SVC is applicable for a wide variety of MVs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nagarkar:2015:CSH, author = "Parth Nagarkar and K. Sel{\c{c}}uk Candan and Aneesha Bhat", title = "Compressed spatial hierarchical bitmap {(cSHB)} indexes for efficiently processing spatial range query workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1382--1393", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824038", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In most spatial data management applications, objects are represented in terms of their coordinates in a 2-dimensional space and search queries in this space are processed using spatial index structures. On the other hand, bitmap-based indexing, especially thanks to the compression opportunities bitmaps provide, has been shown to be highly effective for query processing workloads including selection and aggregation operations. 
In this paper, we show that bitmap-based indexing can also be highly effective for managing spatial data sets. More specifically, we propose a novel compressed spatial hierarchical bitmap (cSHB) index structure to support spatial range queries. We consider query workloads involving multiple range queries over spatial data and introduce and consider the problem of bitmap selection for identifying the appropriate subset of the bitmap files for processing the given spatial range query workload. We develop cost models for compressed domain range query processing and present query planning algorithms that not only select index nodes for query processing, but also associate appropriate bitwise logical operations to identify the data objects satisfying the range queries in the given workload. Experiment results confirm the efficiency and effectiveness of the proposed compressed spatial hierarchical bitmap (cSHB) index structure and the range query planning algorithms in supporting spatial range query workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deutch:2015:SPD, author = "Daniel Deutch and Amir Gilad and Yuval Moskovitch", title = "Selective provenance for datalog programs using top-$k$ queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1394--1405", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824039", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Highly expressive declarative languages, such as datalog, are now commonly used to model the operational logic of data-intensive applications. The typical complexity of such datalog programs, and the large volume of data that they process, call for result explanation. 
Results may be explained through the tracking and presentation of data provenance, and here we focus on a detailed form of provenance (how-provenance), defining it as the set of derivation trees of a given fact. While informative, the size of such full provenance information is typically too large and complex (even when compactly represented) to allow displaying it to the user. To this end, we propose a novel top-$k$ query language for querying datalog provenance, supporting selection criteria based on tree patterns and ranking based on the rules and database facts used in derivation. We propose an efficient novel algorithm based on (1) instrumenting the datalog program so that, upon evaluation, it generates only relevant provenance, and (2) efficient top-$k$ (relevant) provenance generation, combined with bottom-up datalog evaluation. The algorithm computes in polynomial data complexity a compact representation of the top-$k$ trees which may be explicitly constructed in linear time with respect to their size. We further experimentally study the algorithm performance, showing its scalability even for complex datalog programs where full provenance tracking is infeasible.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2015:PPS, author = "Yoonjae Park and Jun-Ki Min and Kyuseok Shim", title = "Processing of probabilistic skyline queries using {MapReduce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1406--1417", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824040", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There has been an increased growth in a number of applications that naturally generate large volumes of uncertain data. By the advent of such applications, the support of advanced analysis queries such as the skyline and its variant operators for big uncertain data has become important. In this paper, we propose the effective parallel algorithms using MapReduce to process the probabilistic skyline queries for uncertain data modeled by both discrete and continuous models. We present three filtering methods to identify probabilistic non-skyline objects in advance. We next develop a single MapReduce phase algorithm PS-QP-MR by utilizing space partitioning based on a variant of quadtrees to distribute the instances of objects effectively and the enhanced algorithm PS-QPF-MR by applying the three filtering methods additionally. We also propose the workload balancing technique to balance the workload of reduce functions based on the number of machines available. Finally, we present the brute-force algorithms PS-BR-MR and PS-BRF-MR with partitioning randomly and applying the filtering methods. In our experiments, we demonstrate the efficiency and scalability of PS-QPF-MR compared to the other algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2015:BVS, author = "Xiaofei Zhang and Hong Cheng and Lei Chen", title = "Bonding vertex sets over distributed graph: a betweenness aware approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1418--1429", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824041", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given two sets of vertices in a graph, it is often of a great interest to find out how these vertices are connected, especially to identify the vertices of high prominence defined on the topological structure. In this work, we formally define a Vertex Set Bonding query (shorted as VSB), which returns a minimum set of vertices with the maximum importance w.r.t total betweenness and shortest path reachability in connecting two sets of input vertices. We find that such a kind of query is representative and could be widely applied in many real world scenarios, e.g., logistic planning, social community bonding and etc. Challenges are that many of such applications are constructed on graphs that are too large to fit in single server, and the VSB query evaluation turns to be NP-hard. To cope with the scalability issue and return the near optimal result in almost real time, we propose a generic solution framework on a shared nothing distributed environment. With the development of two novel techniques, guided graph exploration and betweenness ranking on exploration, we are able to efficiently evaluate queries for error bounded results with bounded space cost. We demonstrate the effectiveness of our solution with extensive experiments over both real and synthetic large graphs on the Google's Cloud platform.
Comparing to the exploration only baseline method, our method achieves several times of speedup.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amsterdamer:2015:NLI, author = "Yael Amsterdamer and Anna Kukliansky and Tova Milo", title = "A natural language interface for querying general and individual knowledge", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1430--1441", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824042", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many real-life scenarios require the joint analysis of general knowledge, which includes facts about the world, with individual knowledge, which relates to the opinions or habits of individuals. Recently developed crowd mining platforms, which were designed for such tasks, are a major step towards the solution. However, these platforms require users to specify their information needs in a formal, declarative language, which may be too complicated for na{\"\i}ve users. To make the joint analysis of general and individual knowledge accessible to the public, it is desirable to provide an interface that translates the user questions, posed in natural language (NL), into the formal query languages that crowd mining platforms support. While the translation of NL questions to queries over conventional databases has been studied in previous work, a setting with mixed individual and general knowledge raises unique challenges. 
In particular, to support the distinct query constructs associated with these two types of knowledge, the NL question must be partitioned and translated using different means; yet eventually all the translated parts should be seamlessly combined to a well-formed query. To account for these challenges, we design and implement a modular translation framework that employs new solutions along with state-of-the art NL parsing tools. The results of our experimental study, involving real user questions on various topics, demonstrate that our framework provides a high-quality translation for many questions that are not handled by previous translation tools.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Psaroudakis:2015:SCM, author = "Iraklis Psaroudakis and Tobias Scheuer and Norman May and Abdelkader Sellami and Anastasia Ailamaki", title = "Scaling up concurrent main-memory column-store scans: towards adaptive {NUMA}-aware data and task placement", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1442--1453", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824043", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Main-memory column-stores are called to efficiently use modern non-uniform memory access (NUMA) architectures to service concurrent clients on big data. The efficient usage of NUMA architectures depends on the data placement and scheduling strategy of the column-store. Most column-stores choose a static strategy that involves partitioning all data across the NUMA architecture, and employing a stealing-based task scheduler. 
In this paper, we implement different strategies for data placement and task scheduling for the case of concurrent scans. We compare these strategies with an extensive sensitivity analysis. Our most significant findings include that unnecessary partitioning can hurt throughput by up to 70\%, and that stealing memory-intensive tasks can hurt throughput by up to 58\%. Based on our analysis, we envision a design that adapts the data placement and task scheduling strategy to the workload.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Oh:2015:SOP, author = "Gihwan Oh and Sangchul Kim and Sang-Won Lee and Bongki Moon", title = "{SQLite} optimization with phase change memory for mobile applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1454--1465", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824044", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given its pervasive use in smart mobile platforms, there is a compelling need to optimize the performance of sluggish SQLite databases. Popular mobile applications such as messenger, email and social network services rely on SQLite for their data management need. Those mobile applications tend to execute relatively short transactions in the autocommit mode for transactional consistency in databases. This often has adverse effect on the flash memory storage in mobile devices because the small random updates cause high write amplification and high write latency. In order to address this problem, we propose a new optimization strategy, called per-page logging (PPL), for mobile data management, and have implemented the key functions in SQLite/PPL. 
The hardware component of SQLite/PPL includes phase change memory (PCM) with a byte-addressable, persistent memory abstraction. By capturing an update in a physiological log record and adding it to the PCM log sector, SQLite/PPL can replace a multitude of successive page writes made to the same logical page with much smaller log writes done to PCM much more efficiently. We have observed that SQLite/PPL would potentially improve the performance of mobile applications by an order of magnitude while supporting transactional atomicity and durability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Crotty:2015:ACU, author = "Andrew Crotty and Alex Galakatos and Kayhan Dursun and Tim Kraska and Carsten Binnig and Ugur Cetintemel and Stan Zdonik", title = "An architecture for compiling {UDF}-centric workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1466--1477", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824045", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data analytics has recently grown to include increasingly sophisticated techniques, such as machine learning and advanced statistics. Users frequently express these complex analytics tasks as workflows of user-defined functions (UDFs) that specify each algorithmic step. However, given typical hardware configurations and dataset sizes, the core challenge of complex analytics is no longer sheer data volume but rather the computation itself, and the next generation of analytics frameworks must focus on optimizing for this computation bottleneck. 
While query compilation has gained widespread popularity as a way to tackle the computation bottleneck for traditional SQL workloads, relatively little work addresses UDF-centric workflows in the domain of complex analytics. In this paper, we describe a novel architecture for automatically compiling workflows of UDFs. We also propose several optimizations that consider properties of the data, UDFs, and hardware together in order to generate different code on a case-by-case basis. To evaluate our approach, we implemented these techniques in Tupleware, a new high-performance distributed analytics system, and our benchmarks show performance improvements of up to three orders of magnitude compared to alternative systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Margo:2015:SDG, author = "Daniel Margo and Margo Seltzer", title = "A scalable distributed graph partitioner", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1478--1489", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824046", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Scalable Host-tree Embeddings for Efficient Partitioning (Sheep), a distributed graph partitioning algorithm capable of handling graphs that far exceed main memory. Sheep produces high quality edge partitions an order of magnitude faster than both state of the art offline (e.g., METIS) and streaming partitioners (e.g., Fennel). Sheep's partitions are independent of the input graph distribution, which means that graph elements can be assigned to processing nodes arbitrarily without affecting the partition quality. 
Sheep transforms the input graph into a strictly smaller elimination tree via a distributed map-reduce operation. By partitioning this tree, Sheep finds an upper-bounded communication volume partitioning of the original graph. We describe the Sheep algorithm and analyze its space-time requirements, partition quality, and intuitive characteristics and limitations. We compare Sheep to contemporary partitioners and demonstrate that Sheep creates competitive partitions, scales to larger graphs, and has better runtime.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sharov:2015:TMY, author = "Artyom Sharov and Alexander Shraer and Arif Merchant and Murray Stokely", title = "Take me to your leader!: online optimization of distributed storage configurations", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1490--1501", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824047", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The configuration of a distributed storage system typically includes, among other parameters, the set of servers and their roles in the replication protocol. Although mechanisms for changing the configuration at runtime exist, it is usually left to system administrators to manually determine the ``best'' configuration and periodically reconfigure the system, often by trial and error. This paper describes a new workload-driven optimization framework that dynamically determines the optimal configuration at run-time. 
We focus on optimizing leader and quorum based replication schemes and divide the framework into three optimization tiers, dynamically optimizing different configuration aspects: (1) leader placement, (2) roles of different servers in the replication protocol, and (3) replica locations. We showcase our optimization framework by applying it to a large-scale distributed storage system used internally in Google and demonstrate that most client applications significantly benefit from using our framework, reducing average operation latency by up to 94\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2015:ARG, author = "Wenfei Fan and Xin Wang and Yinghui Wu and Jingbo Xu", title = "Association rules with graph patterns", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1502--1513", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824048", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose graph-pattern association rules (GPARs) for social media marketing. Extending association rules for item-sets, GPARs help us discover regularities between entities in social graphs, and identify potential customers by exploring social influence. We study the problem of discovering top-$k$ diversified GPARs. While this problem is NP-hard, we develop a parallel algorithm with accuracy bound. We also study the problem of identifying potential customers with GPARs. While it is also NP-hard, we provide a parallel scalable algorithm that guarantees a polynomial speedup over sequential algorithms with the increase of processors.
Using real-life and synthetic graphs, we experimentally verify the scalability and effectiveness of the algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kimmett:2015:FJM, author = "Ben Kimmett and Venkatesh Srinivasan and Alex Thomo", title = "Fuzzy joins in {MapReduce}: an experimental study", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1514--1517", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824049", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We report experimental results for the MapReduce algorithms proposed by Afrati, Das Sarma, Menestrina, Parameswaran and Ullman in ICDE'12 to compute fuzzy joins of binary strings using Hamming Distance. Their algorithms come with complete theoretical analysis, however, no experimental evaluation is provided. They argue that there is a tradeoff between communication cost and processing cost, and that there is a skyline of the proposed algorithms; i.e. none dominates another. We observe via experiments that, from a practical point of view, some algorithms are almost always preferable to others. We provide detailed experimental results and insights that show the different facets of each algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cho:2015:PEP, author = "Minsik Cho and Daniel Brand and Rajesh Bordawekar and Ulrich Finkler and Vincent Kulandaisamy and Ruchir Puri", title = "{PARADIS}: an efficient parallel algorithm for in-place radix sort", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1518--1529", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824050", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-place radix sort is a popular distribution-based sorting algorithm for short numeric or string keys due to its linear run-time and constant memory complexity. However, efficient parallelization of in-place radix sort is very challenging for two reasons. First, the initial phase of permuting elements into buckets suffers read-write dependency inherent in its in-place nature. Secondly, load balancing of the recursive application of the algorithm to the resulting buckets is difficult when the buckets are of very different sizes, which happens for skewed distributions of the input data. In this paper, we present a novel parallel in-place radix sort algorithm, PARADIS, which addresses both problems: (a) ``speculative permutation'' solves the first problem by assigning multiple non-continuous array stripes to each processor. The resulting shared-nothing scheme achieves full parallelization. Since our speculative permutation is not complete, it is followed by a ``repair'' phase, which can again be done in parallel without any data sharing among the processors. (b) ``distribution-adaptive load balancing'' solves the second problem. We dynamically allocate processors in the context of radix sort, so as to minimize the overall completion time. 
Our experimental results show that PARADIS offers excellent performance/scalability on a wide range of input data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vengerov:2015:JSE, author = "David Vengerov and Andre Cavalheiro Menck and Mohamed Zait and Sunil P. Chakkappen", title = "Join size estimation subject to filter conditions", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1530--1541", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824051", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present a new algorithm for estimating the size of equality join of multiple database tables. The proposed algorithm, Correlated Sampling, constructs a small space synopsis for each table, which can then be used to provide a quick estimate of the join size of this table with other tables subject to dynamically specified predicate filter conditions, possibly specified over multiple columns (attributes) of each table. This algorithm makes a single pass over the data and is thus suitable for streaming scenarios. We compare this algorithm analytically to two other previously known sampling approaches (independent Bernoulli Sampling and End-Biased Sampling) and to a novel sketch-based approach. We also compare these four algorithms experimentally and show that results fully correspond to our analytical predictions based on derived expressions for the estimator variances, with Correlated Sampling giving the best estimates in a large range of situations.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2015:AFT, author = "Jingjing Wang and Magdalena Balazinska and Daniel Halperin", title = "Asynchronous and fault-tolerant recursive datalog evaluation in shared-nothing engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1542--1553", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824052", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a new approach for data analytics with iterations. Users express their analysis in Datalog with bag-monotonic aggregate operators, which enables the expression of computations from a broad variety of application domains. Queries are translated into query plans that can execute in shared-nothing engines, are incremental, and support a variety of iterative models (synchronous, asynchronous, different processing priorities) and failure-handling techniques. The plans require only small extensions to an existing shared-nothing engine, making the approach easily implementable. We implement the approach in the Myria big-data management system and use our implementation to empirically study the performance characteristics of different combinations of iterative models, failure handling methods, and applications. Our evaluation uses workloads from a variety of application domains. We find that no single method outperforms others but rather that application properties must drive the selection of the iterative query execution model.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mouratidis:2015:MRQ, author = "Kyriakos Mouratidis and Jilian Zhang and HweeHwa Pang", title = "Maximum rank query", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1554--1565", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824053", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The top-$k$ query is a common means to shortlist a number of options from a set of alternatives, based on the user's preferences. Typically, these preferences are expressed as a vector of query weights, defined over the options' attributes. The query vector implicitly associates each alternative with a numeric score, and thus imposes a ranking among them. The top-$k$ result includes the $k$ options with the highest scores. In this context, we define the maximum rank query (MaxRank). Given a focal option in a set of alternatives, the MaxRank problem is to compute the highest rank this option may achieve under any possible user preference, and furthermore, to report all the regions in the query vector's domain where that rank is achieved. MaxRank finds application in market impact analysis, customer profiling, targeted advertising, etc. We propose a methodology for MaxRank processing and evaluate it with experiments on real and benchmark synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Katsarou:2015:PSI, author = "Foteini Katsarou and Nikos Ntarmos and Peter Triantafillou", title = "Performance and scalability of indexed subgraph query processing methods", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1566--1577", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824054", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph data management systems have become very popular as graphs are the natural data model for many applications. One of the main problems addressed by these systems is subgraph query processing; i.e., given a query graph, return all graphs that contain the query. The naive method for processing such queries is to perform a subgraph isomorphism test against each graph in the dataset. This obviously does not scale, as subgraph isomorphism is NP-Complete. Thus, many indexing methods have been proposed to reduce the number of candidate graphs that have to underpass the subgraph isomorphism test. In this paper, we identify a set of key factors-parameters, that influence the performance of related methods: namely, the number of nodes per graph, the graph density, the number of distinct labels, the number of graphs in the dataset, and the query graph size. We then conduct comprehensive and systematic experiments that analyze the sensitivity of the various methods on the values of the key parameters. Our aims are twofold: first to derive conclusions about the algorithms' relative performance, and, second, to stress-test all algorithms, deriving insights as to their scalability, and highlight how both performance and scalability depend on the above factors. 
We choose six well-established indexing methods, namely Grapes, CT-Index, GraphGrepSX, gIndex, Tree+ $ \Delta $, and gCode, as representative approaches of the overall design space, including the most recent and best performing methods. We report on their index construction time and index size, and on query processing performance in terms of time and false positive ratio. We employ both real and synthetic datasets. Specifically, four real datasets of different characteristics are used: AIDS, PDBS, PCM, and PPI. In addition, we generate a large number of synthetic graph datasets, empowering us to systematically study the algorithms' performance and scalability versus the aforementioned key parameters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2015:LDA, author = "Ying Yang and Niccol{\`o} Meneghetti and Ronny Fehling and Zhen Hua Liu and Oliver Kennedy", title = "{Lenses}: an on-demand approach to {ETL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1578--1589", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824055", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Three mentalities have emerged in analytics. One view holds that reliable analytics is impossible without high-quality data, and relies on heavy-duty ETL processes and upfront data curation to provide it. The second view takes a more ad-hoc approach, collecting data into a data lake, and placing responsibility for data quality on the analyst querying it. 
A third, on-demand approach has emerged over the past decade in the form of numerous systems like Paygo or HLog, which allow for incremental curation of the data and help analysts to make principled trade-offs between data quality and effort. Though quite useful in isolation, these systems target only specific quality problems (e.g., Paygo targets only schema matching and entity resolution). In this paper, we explore the design of a general, extensible infrastructure for on-demand curation that is based on probabilistic query processing. We illustrate its generality through examples and show how such an infrastructure can be used to gracefully make existing ETL workflows ``on-demand''. Finally, we present a user interface for On-Demand ETL and address ensuing challenges, including that of efficiently ranking potential data curation tasks. Our experimental results show that On-Demand ETL is feasible and that our greedy ranking strategy for curation tasks, called CPI, is effective.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2015:KG, author = "Wenfei Fan and Zhe Fan and Chao Tian and Xin Luna Dong", title = "Keys for graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1590--1601", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824056", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Keys for graphs aim to uniquely identify entities represented by vertices in a graph. We propose a class of keys that are recursively defined in terms of graph patterns, and are interpreted with subgraph isomorphism. 
Extending conventional keys for relations and XML, these keys find applications in object identification, knowledge fusion and social network reconciliation. As an application, we study the entity matching problem that, given a graph $G$ and a set $ \Sigma $ of keys, is to find all pairs of entities (vertices) in $G$ that are identified by keys in $ \Sigma $. We show that the problem is intractable, and cannot be parallelized in logarithmic rounds. Nonetheless, we provide two parallel scalable algorithms for entity matching, in MapReduce and a vertex-centric asynchronous model. Using real-life and synthetic data, we experimentally verify the effectiveness and scalability of the algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eldawy:2015:SPT, author = "Ahmed Eldawy and Louai Alarabi and Mohamed F. Mokbel", title = "Spatial partitioning techniques in {SpatialHadoop}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1602--1605", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824057", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SpatialHadoop is an extended MapReduce framework that supports global indexing that spatial partitions the data across machines providing orders of magnitude speedup, compared to traditional Hadoop. In this paper, we describe seven alternative partitioning techniques and experimentally study their effect on the quality of the generated index and the performance of range and spatial join queries. We found that using a 1\% sample is enough to produce high quality partitions. Also, we found that the total area of partitions is a reasonable measure of the quality of indexes when running spatial join. 
This study will assist researchers in choosing a good spatial partitioning technique in distributed environments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Manabe:2015:ELH, author = "Tomohiro Manabe and Keishi Tajima", title = "Extracting logical hierarchical structure of {HTML} documents based on headings", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1606--1617", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824058", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a method for extracting logical hierarchical structure of HTML documents. Because mark-up structure in HTML documents does not necessarily coincide with logical hierarchical structure, it is not trivial how to extract logical structure of HTML documents. Human readers, however, easily understand their logical structure. The key information used by them is headings in the documents. Human readers exploit the following properties of headings: (1) headings appear at the beginning of the corresponding blocks, (2) headings are given prominent visual styles, (3) headings of the same level share the same visual style, and (4) headings of higher levels are given more prominent visual styles. Our method also exploits these properties for extracting hierarchical headings and their associated blocks. Our experiment shows that our method outperforms existing methods. In addition, our method extracts not only hierarchical blocks but also their associated headings.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Naidan:2015:PSM, author = "Bilegsaikhan Naidan and Leonid Boytsov and Eric Nyberg", title = "Permutation search methods are efficient, yet faster search is possible", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1618--1629", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824059", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We survey permutation-based methods for approximate $k$-nearest neighbor search. In these methods, every data point is represented by a ranked list of pivots sorted by the distance to this point. Such ranked lists are called permutations. The underpinning assumption is that, for both metric and non-metric spaces, the distance between permutations is a good proxy for the distance between original points. Thus, it should be possible to efficiently retrieve most true nearest neighbors by examining only a tiny subset of data points whose permutations are similar to the permutation of a query. We further test this assumption by carrying out an extensive experimental evaluation where permutation methods are pitted against state-of-the-art benchmarks (the multi-probe LSH, the VP-tree, and proximity-graph based retrieval) on a variety of realistically large data set from the image and textual domain. The focus is on the high-accuracy retrieval methods for generic spaces. Additionally, we assume that both data and indices are stored in main memory. We find permutation methods to be reasonably efficient and describe a setup where these methods are most useful. To ease reproducibility, we make our software and data sets publicly available.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mukherjee:2015:DAO, author = "Niloy Mukherjee and Shasank Chavan and Maria Colgan and Dinesh Das and Mike Gleeson and Sanket Hase and Allison Holloway and Hui Jin and Jesse Kamp and Kartik Kulkarni and Tirthankar Lahiri and Juan Loaiza and Neil Macnaughton and Vineet Marwah and Atrayee Mullick and Andy Witkowski and Jiaqi Yan and Mohamed Zait", title = "Distributed architecture of {Oracle} database in-memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1630--1641", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824061", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the last few years, the information technology industry has witnessed revolutions in multiple dimensions. Increasing ubiquitous sources of data have posed two connected challenges to data management solutions --- processing unprecedented volumes of data, and providing ad-hoc real-time analysis in mainstream production data stores without compromising regular transactional workload performance. In parallel, computer hardware systems are scaling out elastically, scaling up in the number of processors and cores, and increasing main memory capacity extensively. The data processing challenges combined with the rapid advancement of hardware systems has necessitated the evolution of a new breed of main-memory databases optimized for mixed OLTAP environments and designed to scale. 
The Oracle RDBMS In-memory Option (DBIM) is an industry-first distributed dual format architecture that allows a database object to be stored in columnar format in main memory highly optimized to break performance barriers in analytic query workloads, simultaneously maintaining transactional consistency with the corresponding OLTP optimized row-major format persisted in storage and accessed through database buffer cache. In this paper, we present the distributed, highly-available, and fault-tolerant architecture of the Oracle DBIM that enables the RDBMS to transparently scale out in a database cluster, both in terms of memory capacity and query processing throughput. We believe that the architecture is unique among all mainstream in-memory databases. It allows complete application-transparent, extremely scalable and automated distribution of Oracle RDBMS objects in-memory across a cluster, as well as across multiple NUMA nodes within a single server. It seamlessly provides distribution awareness to the Oracle SQL execution framework through affinitized fault-tolerant parallel execution within and across servers without explicit optimizer plan changes or query rewrites.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haas:2015:AMC, author = "Daniel Haas and Jason Ansel and Lydia Gu and Adam Marcus", title = "{Argonaut}: macrotask crowdsourcing for complex data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1642--1653", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824062", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourced workflows are used in research and industry to solve a variety of tasks. 
The databases community has used crowd workers in query operators/optimization and for tasks such as entity resolution. Such research utilizes microtasks where crowd workers are asked to answer simple yes/no or multiple choice questions with little training. Typically, microtasks are used with voting algorithms to combine redundant responses from multiple crowd workers to achieve result quality. Microtasks are powerful, but fail in cases where larger context (e.g., domain knowledge) or significant time investment is needed to solve a problem, for example in large-document structured data extraction. In this paper, we consider context-heavy data processing tasks that may require many hours of work, and refer to such tasks as macrotasks. Leveraging the infrastructure and worker pools of existing crowdsourcing platforms, we automate macrotask scheduling, evaluation, and pay scales. A key challenge in macrotask-powered work, however, is evaluating the quality of a worker's output, since ground truth is seldom available and redundancy-based quality control schemes are impractical. We present Argonaut, a framework that improves macrotask powered work quality using a hierarchical review. Argonaut uses a predictive model of worker quality to select trusted workers to perform review, and a separate predictive model of task quality to decide which tasks to review. Finally, Argonaut can identify the ideal trade-off between a single phase of review and multiple phases of review given a constrained review budget in order to maximize overall output quality. We evaluate an industrial use of Argonaut to power a structured data extraction pipeline that has utilized over half a million hours of crowd worker input to complete millions of macrotasks. We show that Argonaut can capture up to 118\% more errors than random spot-check reviews in review budget-constrained environments with up to two review layers.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2015:BRL, author = "Guozhang Wang and Joel Koshy and Sriram Subramanian and Kartik Paramasivam and Mammad Zadeh and Neha Narkhede and Jun Rao and Jay Kreps and Joe Stein", title = "Building a replicated logging system with {Apache Kafka}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1654--1655", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824063", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Apache Kafka is a scalable publish-subscribe messaging system with its core architecture as a distributed commit log. It was originally built at LinkedIn as its centralized event pipelining platform for online data integration tasks. Over the past years developing and operating Kafka, we extend its log-structured architecture as a replicated logging backbone for much wider application scopes in the distributed environment. In this abstract, we will talk about our design and engineering experience to replicate Kafka logs for various distributed data-driven systems at LinkedIn, including source-of-truth data storage and stream processing.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Loro:2015:ISH, author = "Alessandra Loro and Anja Gruenheid and Donald Kossmann and Damien Profeta and Philippe Beaudequin", title = "Indexing and selecting hierarchical business logic", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1656--1667", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824064", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Business rule management is the task of storing and maintaining company-specific decision rules and business logic that is queried frequently by application users. These rules can impede efficient query processing when they require the business rule engine to resolve semantic hierarchies. To address this problem, this work discusses hierarchical indexes that are performance and storage-conscious. In the first part of this work, we develop a tree-based hierarchical structure that represents client-defined semantic hierarchies as well as two variants of this structure that improve performance and main memory allocation. The second part of our work focuses on selecting the top rules out of those retrieved from the index. We formally define a priority score-based decision scheme that allows for a conflict-free rule system and efficient rule ranking. Additionally, we introduce a weight-based lazy merging technique for rule selection. All of these techniques are evaluated with real world and synthetic data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shukla:2015:SAI, author = "Dharma Shukla and Shireesh Thota and Karthik Raman and Madhan Gajendran and Ankur Shah and Sergii Ziuzin and Krishnan Sundaram and Miguel Gonzalez Guajardo and Anna Wawrzyniak and Samer Boshra and Renato Ferreira and Mohamed Nassar and Michael Koltachev and Ji Huang and Sudipta Sengupta and Justin Levandoski and David Lomet", title = "Schema-agnostic indexing with {Azure DocumentDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1668--1679", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824065", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Azure DocumentDB is Microsoft's multi-tenant distributed database service for managing JSON documents at Internet scale. DocumentDB is now generally available to Azure developers. In this paper, we describe the DocumentDB indexing subsystem. DocumentDB indexing enables automatic indexing of documents without requiring a schema or secondary indices. Uniquely, DocumentDB provides real-time consistent queries in the face of very high rates of document updates. As a multi-tenant service, DocumentDB is designed to operate within extremely frugal resource budgets while providing predictable performance and robust resource isolation to its tenants. This paper describes the DocumentDB capabilities, including document representation, query language, document indexing approach, core index support, and early production experiences.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Boutin:2015:JRI, author = "Eric Boutin and Paul Brett and Xiaoyu Chen and Jaliya Ekanayake and Tao Guan and Anna Korsun and Zhicheng Yin and Nan Zhang and Jingren Zhou", title = "{JetScope}: reliable and interactive analytics at cloud scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1680--1691", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824066", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Interactive, reliable, and rich data analytics at cloud scale is a key capability to support low latency data exploration and experimentation over terabytes of data for a wide range of business scenarios. Besides the challenges in massive scalability and low latency distributed query processing, it is imperative to achieve all these requirements with effective fault tolerance and efficient recovery, as failures and fluctuations are the norm in such a distributed environment. We present a cloud scale interactive query processing system, called JetScope, developed at Microsoft. The system has a SQL-like declarative scripting language and delivers massive scalability and high performance through advanced optimizations. In order to achieve low latency, the system leverages various access methods, optimizes delivering first rows, and maximizes network and scheduling efficiency. The system also provides a fine-grained fault tolerance mechanism which is able to efficiently detect and mitigate failures without significantly impacting the query latency and user experience. JetScope has been deployed to hundreds of servers in production at Microsoft, serving a few million queries every day.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hu:2015:DPT, author = "Xueyang Hu and Mingxuan Yuan and Jianguo Yao and Yu Deng and Lei Chen and Qiang Yang and Haibing Guan and Jia Zeng", title = "Differential privacy in telco big data platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1692--1703", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824067", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy (DP) has been widely explored in academia recently but less so in industry possibly due to its strong privacy guarantee. This paper makes the first attempt to implement three basic DP architectures in the deployed telecommunication (telco) big data platform for data mining applications. We find that all DP architectures have less than 5\% loss of prediction accuracy when the weak privacy guarantee is adopted (e.g., privacy budget parameter $ \epsilon \geq 3$). However, when the strong privacy guarantee is assumed (e.g., privacy budget parameter $ \epsilon \leq 0.1 $), all DP architectures lead to 15\%--30\% accuracy loss, which implies that real-world industrial data mining systems cannot work well under such a strong privacy guarantee recommended by previous research works. Among the three basic DP architectures, the Hybridized DM (Data Mining) and DB (Database) architecture performs the best because of its complicated privacy protection design for the specific data mining algorithm. Through extensive experiments on big data, we also observe that the accuracy loss increases by increasing the variety of features, but decreases by increasing the volume of training data. 
Therefore, to make DP practically usable in large-scale industrial systems, our observations suggest that we may explore three possible research directions in future: (1) Relaxing the privacy guarantee (e.g., increasing privacy budget $ \epsilon $) and studying its effectiveness on specific industrial applications; (2) Designing specific privacy scheme for specific data mining algorithms; and (3) Using large volume of data but with low variety for training the classification models.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{El-Helw:2015:OCT, author = "Amr El-Helw and Venkatesh Raghavan and Mohamed A. Soliman and George Caragea and Zhongxian Gu and Michalis Petropoulos", title = "Optimization of common table expressions in {MPP} database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1704--1715", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824068", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big Data analytics often include complex queries with similar or identical expressions, usually referred to as Common Table Expressions (CTEs). CTEs may be explicitly defined by users to simplify query formulations, or implicitly included in queries generated by business intelligence tools, financial applications and decision support systems. In Massively Parallel Processing (MPP) database systems, CTEs pose new challenges due to the distributed nature of query processing, the overwhelming volume of underlying data and the scalability criteria that systems are required to meet. In these settings, the effective optimization and efficient execution of CTEs are crucial for the timely processing of analytical queries over Big Data. 
In this paper, we present a comprehensive framework for the representation, optimization and execution of CTEs in the context of Orca --- Pivotal's query optimizer for Big Data. We demonstrate experimentally the benefits of our techniques using industry standard decision support benchmark.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goel:2015:TSR, author = "Anil K. Goel and Jeffrey Pound and Nathan Auch and Peter Bumbulis and Scott MacLean and Franz F{\"a}rber and Francis Gropengiesser and Christian Mathis and Thomas Bodner and Wolfgang Lehner", title = "Towards scalable real-time analytics: an architecture for scale-out of {OLxP} workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1716--1727", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824069", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present an overview of our work on the SAP HANA Scale-out Extension, a novel distributed database architecture designed to support large scale analytics over real-time data. This platform permits high performance OLAP with massive scale-out capabilities, while concurrently allowing OLTP workloads. This dual capability enables analytics over real-time changing data and allows fine grained user-specified service level agreements (SLAs) on data freshness. We advocate the decoupling of core database components such as query processing, concurrency control, and persistence, a design choice made possible by advances in high-throughput low-latency networks and storage devices. 
We provide full ACID guarantees and build on a logical timestamp mechanism to provide MVCC-based snapshot isolation, while not requiring synchronous updates of replicas. Instead, we use asynchronous update propagation guaranteeing consistency with timestamp validation. We provide a view into the design and development of a large scale data management platform for real-time analytics, driven by the needs of modern enterprise customers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dasu:2015:FMF, author = "Tamraparni Dasu and Vladislav Shkapenyuk and Divesh Srivastava and Deborah F. Swayne", title = "{FIT} to monitor feed quality", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1728--1739", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824070", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While there has been significant focus on collecting and managing data feeds, it is only now that attention is turning to their quality. In this paper, we propose a principled approach to online data quality monitoring in a dynamic feed environment. Our goal is to alert quickly when feed behavior deviates from expectations. We make contributions in two distinct directions. First, we propose novel enhancements to permit a publish-subscribe approach to incorporate data quality modules into the DFMS architecture. 
Second, we propose novel temporal extensions to standard statistical techniques to adapt them to online feed monitoring for outlier detection and alert generation at multiple scales along three dimensions: aggregation at multiple time intervals to detect at varying levels of sensitivity; multiple lengths of data history for varying the speed at which models adapt to change; and multiple levels of monitoring delay to address lagged data arrival. FIT, or Feed Inspection Tool, is the result of a successful implementation of our approach. We present several case studies outlining the effective deployment of FIT in real applications along with user testimonials.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Larson:2015:RTA, author = "Per-{\AA}ke Larson and Adrian Birka and Eric N. Hanson and Weiyun Huang and Michal Nowakiewicz and Vassilis Papadimos", title = "Real-time analytical processing with {SQL} server", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1740--1751", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824071", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the last two releases SQL Server has integrated two specialized engines into the core system: the Apollo column store engine for analytical workloads and the Hekaton in-memory engine for high-performance OLTP workloads. There is an increasing demand for real-time analytics, that is, for running analytical queries and reporting on the same system as transaction processing so as to have access to the freshest data. SQL Server 2016 will include enhancements to column store indexes and in-memory tables that significantly improve performance on such hybrid workloads. 
This paper describes four such enhancements: column store indexes on in-memory tables, making secondary column store indexes on disk-based tables updatable, allowing B-tree indexes on primary column store indexes, and further speeding up the column store scan operator.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2015:EEO, author = "You Wu and Boulos Harb and Jun Yang and Cong Yu", title = "Efficient evaluation of object-centric exploration queries for visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1752--1763", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824072", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The most effective way to explore data is through visualizing the results of exploration queries. For example, an exploration query could be an aggregate of some measures over time intervals, and a pattern or abnormality can be discovered through a time series plot of the query results. In this paper, we examine a special kind of exploration query, namely object-centric exploration query. Common examples include claims made about athletes in sports databases, such as ``it is newsworthy that LeBron James has scored 35 or more points in nine consecutive games.'' We focus on one common type of visualization, i.e., 2d scatter plot with heatmap. Namely, we consider exploration queries whose results can be plotted on a two-dimensional space, possibly with colors indicating object densities in regions. While we model results as pairs of numbers, the types of the queries are limited only by the users' imagination. 
In the LeBron James example above, the two dimensions are minimum points scored per game and number of consecutive games, respectively. It is easy to find other equally interesting dimensions, such as minimum rebounds per game or number of playoff games. We formalize this problem and propose an efficient, interactive-speed algorithm that takes a user-provided exploration query (which can be a blackbox function) and produces an approximate visualization that preserves the two most important visual properties: the outliers and the overall distribution of all result points.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiao:2015:GUD, author = "Lin Qiao and Yinan Li and Sahil Takiar and Ziyang Liu and Narasimha Veeramreddy and Min Tu and Ying Dai and Issac Buenrostro and Kapil Surlaker and Shirshanka Das and Chavdar Botev", title = "{Gobblin}: unifying data ingestion for {Hadoop}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1764--1769", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824073", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data ingestion is an essential part of companies and organizations that collect and analyze large volumes of data. This paper describes Gobblin, a generic data ingestion framework for Hadoop and one of LinkedIn's latest open source products. At LinkedIn we need to ingest data from various sources such as relational stores, NoSQL stores, streaming systems, REST endpoints, filesystems, etc. into our Hadoop clusters. Maintaining independent pipelines for each source can lead to various operational problems. 
Gobblin aims to solve this issue by providing a centralized data ingestion framework that makes it easy to support ingesting data from a variety of sources. Gobblin distinguishes itself from similar frameworks by focusing on three core principles: generality, extensibility, and operability. Gobblin supports a mixture of data sources out-of-the-box and can be easily extended for more. This enables an organization to use a single framework to handle different data ingestion needs, making it easy and inexpensive to operate. Moreover, with an end-to-end metrics collection and reporting module, Gobblin makes it simple and efficient to identify issues in production.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Das:2015:QOO, author = "Dinesh Das and Jiaqi Yan and Mohamed Zait and Satyanarayana R. Valluri and Nirav Vyas and Ramarajan Krishnamachari and Prashant Gaharwar and Jesse Kamp and Niloy Mukherjee", title = "Query optimization in {Oracle 12c} database in-memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1770--1781", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824074", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional on-disk row major tables have been the dominant storage mechanism in relational databases for decades. Over the last decade, however, with explosive growth in data volume and demand for faster analytics, has come the recognition that a different data representation is needed. There is widespread agreement that in-memory column-oriented databases are best suited to meet the realities of this new world. 
Oracle 12c Database In-memory, the industry's first dual-format database, allows existing row major on-disk tables to have complementary in-memory columnar representations. The new storage format brings new data processing techniques and query execution algorithms and thus new challenges for the query optimizer. Execution plans that are optimal for one format may be sub-optimal for the other. In this paper, we describe the changes made in the query optimizer to generate execution plans optimized for the specific format --- row major or columnar --- that will be scanned during query execution. With enhancements in several areas --- statistics, cost model, query transformation, access path and join optimization, parallelism, and cluster-awareness --- the query optimizer plays a significant role in unlocking the full promise and performance of Oracle Database In-Memory.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Green:2015:LPL, author = "Todd J. Green and Dan Olteanu and Geoffrey Washburn", title = "Live programming in the {LogicBlox} system: a {MetaLogiQL} approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1782--1791", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824075", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The emerging category of self-service enterprise applications motivates support for ``live programming'' in the database, where the user's iterative data exploration triggers changes to installed application code and its output in real time. This paper discusses the technical challenges in supporting live programming in the database and presents the solution implemented in the LogicBlox commercial system. 
The workhorse architectural component is a ``meta-engine'' that incrementally maintains metadata representing application code, guides its compilation into an internal representation in the database kernel, and orchestrates maintenance of materialized views based on those changes. Our approach mirrors LogicBlox's declarative programming model and describes the maintenance of application code using declarative meta-rules; the meta-engine is essentially a ``bootstrap'' version of the database engine proper. Beyond live programming, the meta-engine turns out effective for a range of static analysis and optimization tasks. Outside of the database context, we speculate that our design may even provide a novel means of building incremental compilers for general-purpose programming languages.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Akidau:2015:DMP, author = "Tyler Akidau and Robert Bradshaw and Craig Chambers and Slava Chernyak and Rafael J. Fern{\'a}ndez-Moctezuma and Reuven Lax and Sam McVeety and Daniel Mills and Frances Perry and Eric Schmidt and Sam Whittle", title = "The dataflow model: a practical approach to balancing correctness, latency, and cost in massive-scale, unbounded, out-of-order data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1792--1803", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824076", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Unbounded, unordered, global-scale datasets are increasingly common in day-to-day business (e.g. Web logs, mobile usage statistics, and sensor networks). 
At the same time, consumers of these datasets have evolved sophisticated requirements, such as event-time ordering and windowing by features of the data themselves, in addition to an insatiable hunger for faster answers. Meanwhile, practicality dictates that one can never fully optimize along all dimensions of correctness, latency, and cost for these types of input. As a result, data processing practitioners are left with the quandary of how to reconcile the tensions between these seemingly competing propositions, often resulting in disparate implementations and systems. We propose that a fundamental shift of approach is necessary to deal with these evolved requirements in modern data processing. We as a field must stop trying to groom unbounded datasets into finite pools of information that eventually become complete, and instead live and breathe under the assumption that we will never know if or when we have seen all of our data, only that new data will arrive, old data may be retracted, and the only way to make this problem tractable is via principled abstractions that allow the practitioner the choice of appropriate tradeoffs along the axes of interest: correctness, latency, and cost. In this paper, we present one such approach, the Dataflow Model, along with a detailed examination of the semantics it enables, an overview of the core principles that guided its design, and a validation of the model itself via the real-world experiences that led to its development.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ching:2015:OTE, author = "Avery Ching and Sergey Edunov and Maja Kabiljo and Dionysios Logothetis and Sambavi Muthukrishnan", title = "One trillion edges: graph processing at {Facebook}-scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1804--1815", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824077", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analyzing large graphs provides valuable insights for social networking and web companies in content ranking and recommendations. While numerous graph processing systems have been developed and evaluated on available benchmark graphs of up to 6.6B edges, they often face significant difficulties in scaling to much larger graphs. Industry graphs can be two orders of magnitude larger --- hundreds of billions or up to one trillion edges. In addition to scalability challenges, real world applications often require much more complex graph processing workflows than previously evaluated. In this paper, we describe the usability, performance, and scalability improvements we made to Apache Giraph, an open-source graph processing system, in order to use it on Facebook-scale graphs of up to one trillion edges. We also describe several key extensions to the original Pregel model that make it possible to develop a broader range of production graph applications and workflows as well as improve code reuse. Finally, we report on real-world operations as well as performance characteristics of several large-scale production applications.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pelkonen:2015:GFS, author = "Tuomas Pelkonen and Scott Franklin and Justin Teller and Paul Cavallaro and Qi Huang and Justin Meza and Kaushik Veeraraghavan", title = "{Gorilla}: a fast, scalable, in-memory time series database", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1816--1827", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824078", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale internet services aim to remain highly available and responsive in the presence of unexpected failures. Providing this service often requires monitoring and analyzing tens of millions of measurements per second across a large number of systems, and one particularly effective solution is to store and query such measurements in a time series database (TSDB). A key challenge in the design of TSDBs is how to strike the right balance between efficiency, scalability, and reliability. In this paper we introduce Gorilla, Facebook's in-memory TSDB. Our insight is that users of monitoring systems do not place much emphasis on individual data points but rather on aggregate analysis, and recent data points are of much higher value than older points to quickly detect and diagnose the root cause of an ongoing problem. Gorilla optimizes for remaining highly available for writes and reads, even in the face of failures, at the expense of possibly dropping small amounts of data on the write path. To improve query efficiency, we aggressively leverage compression techniques such as delta-of-delta timestamps and XOR'd floating point values to reduce Gorilla's storage footprint by 10x. 
This allows us to store Gorilla's data in memory, reducing query latency by 73x and improving query throughput by 14x when compared to a traditional database (HBase)-backed time series data. This performance improvement has unlocked new monitoring and debugging tools, such as time series correlation search and more dense visualization tools. Gorilla also gracefully handles failures from a single-node to entire regions with little to no operational overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Potharaju:2015:CLC, author = "Rahul Potharaju and Joseph Chan and Luhui Hu and Cristina Nita-Rotaru and Mingshi Wang and Liyuan Zhang and Navendu Jain", title = "{ConfSeer}: leveraging customer support knowledge bases for automated misconfiguration detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1828--1839", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824079", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce ConfSeer, an automated system that detects potential configuration issues or deviations from identified best practices by leveraging a knowledge base (KB) of technical solutions. The intuition is that these KB articles describe the configuration problems and their fixes so if the system can accurately understand them, it can automatically pinpoint both the errors and their resolution. Unfortunately, finding an accurate match is difficult because (a) the KB articles are written in natural language text, and (b) configuration files typically contain a large number of parameters with a high value range. Thus, expert-driven manual troubleshooting is not scalable. 
While there are several state-of-the-art techniques proposed for individual tasks such as keyword matching, concept determination and entity resolution, none offer a practical end-to-end solution to detect problems in machine configurations. In this paper, we describe our experiences building ConfSeer using a novel combination of ideas from natural language processing, information retrieval and interactive learning. ConfSeer powers the recommendation engine behind Microsoft Operations Management Suite that proposes fixes for software configuration errors. The system has been running in production for about a year to proactively find misconfigurations on tens of thousands of servers. Our evaluation of ConfSeer against an expert-defined rule-based commercial system, an expert survey and web search engines shows that it achieves 80\%--97.5\% accuracy and incurs low runtime overheads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Armbrust:2015:SSR, author = "Michael Armbrust and Tathagata Das and Aaron Davidson and Ali Ghodsi and Andrew Or and Josh Rosen and Ion Stoica and Patrick Wendell and Reynold Xin and Matei Zaharia", title = "Scaling {Spark} in the real world: performance and usability", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1840--1843", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824080", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Apache Spark is one of the most widely used open source processing engines for big data, with rich language-integrated APIs and a wide range of libraries.
Over the past two years, our group has worked to deploy Spark to a wide range of organizations through consulting relationships as well as our hosted service, Databricks. We describe the main challenges and requirements that appeared in taking Spark to a wide set of users, and usability and performance improvements we have made to the engine in response.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sahli:2015:SLS, author = "Majed Sahli and Essam Mansour and Panos Kalnis", title = "{StarDB}: a large-scale {DBMS} for strings", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1844--1847", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824082", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Strings and applications using them are proliferating in science and business. Currently, strings are stored in file systems and processed using ad-hoc procedural code. Existing techniques are not flexible and cannot efficiently handle complex queries or large datasets. In this paper, we demonstrate StarDB, a distributed database system for analytics on strings. StarDB hides data and system complexities and allows users to focus on analytics. It uses a comprehensive set of parallel string operations and provides a declarative query language to solve complex queries. StarDB automatically tunes itself and runs with over 90\% efficiency on supercomputers, public clouds, clusters, and workstations. We test StarDB using real datasets that are 2 orders of magnitude larger than the datasets reported by previous works.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Harbi:2015:ESQ, author = "Razen Harbi and Ibrahim Abdelaziz and Panos Kalnis and Nikos Mamoulis", title = "Evaluating {SPARQL} queries on massive {RDF} datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1848--1851", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824083", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed RDF systems partition data across multiple computer nodes. Partitioning is typically based on heuristics that minimize inter-node communication and it is performed in an initial, data pre-processing phase. Therefore, the resulting partitions are static and do not adapt to changes in the query workload; as a result, existing systems are unable to consistently avoid communication for queries that are not favored by the initial data partitioning. Furthermore, for very large RDF knowledge bases, the partitioning phase becomes prohibitively expensive, leading to high startup costs. In this paper, we propose AdHash, a distributed RDF system which addresses the shortcomings of previous work. First, AdHash initially applies lightweight hash partitioning, which drastically minimizes the startup cost, while favoring the parallel processing of join patterns on subjects, without any data communication. Using a locality-aware planner, queries that cannot be processed in parallel are evaluated with minimal communication. Second, AdHash monitors the data access patterns and adapts dynamically to the query load by incrementally redistributing and replicating frequently accessed data. As a result, the communication cost for future queries is drastically reduced or even eliminated. 
Our experiments with synthetic and real data verify that AdHash (i) starts faster than all existing systems, (ii) processes thousands of queries before other systems become online, and (iii) gracefully adapts to the query load, being able to evaluate queries on billion-scale RDF data in sub-seconds. In this demonstration, audience can use a graphical interface of AdHash to verify its performance superiority compared to state-of-the-art distributed RDF systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kou:2015:TBR, author = "Ngai Meng Kou and Leong Hou U. and Nikos Mamoulis and Yuhong Li and Ye Li and Zhiguo Gong", title = "A topic-based reviewer assignment system", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1852--1855", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824084", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Peer reviewing is a widely accepted mechanism for assessing the quality of submitted articles to scientific conferences or journals. Conference management systems (CMS) are used by conference organizers to invite appropriate reviewers and assign them to submitted papers. Typical CMS rely on paper bids entered by the reviewers and apply simple matching algorithms to compute the paper assignment. In this paper, we demonstrate our Reviewer Assignment System (RAS), which has advanced features compared to broadly used CMSs. First, RAS automatically extracts the profiles of reviewers and submissions in the form of topic vectors. These profiles can be used to automatically assign reviewers to papers without relying on a bidding process, which can be tedious and error-prone. 
Second, besides supporting classic assignment models (e.g., stable marriage and optimal assignment), RAS includes a recently published assignment model by our research group, which maximizes, for each paper, the coverage of its topics by the profiles of its reviewers. The features of the demonstration include (1) automatic extraction of paper and reviewer profiles, (2) assignment computation by different models, and (3) visualization of the results by different models, in order to assess their effectiveness.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liroz-Gistau:2015:FHE, author = "Miguel Liroz-Gistau and Reza Akbarinia and Patrick Valduriez", title = "{FP--Hadoop}: efficient execution of parallel jobs over skewed data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1856--1859", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824085", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data parallel frameworks, such as MapReduce or Spark have been praised for their high scalability and performance, but show poor performance in the case of data skew. There are important cases where a high percentage of processing in the reduce side ends up being done by only one node. In this demonstration, we illustrate the use of FP-Hadoop, a system that efficiently deals with data skew in MapReduce jobs. In FP-Hadoop, there is a new phase, called intermediate reduce (IR), in which blocks of intermediate values, constructed dynamically, are processed by intermediate reduce workers in parallel, by using a scheduling strategy. 
Within the IR phase, even if all intermediate values belong to only one key, the main part of the reducing work can be done in parallel using the computing resources of all available workers. We implemented a prototype of FP-Hadoop, and conducted extensive experiments over synthetic and real datasets. We achieve excellent performance gains compared to native Hadoop, e.g. more than 10 times in reduce time and 5 times in total execution time. During our demonstration, we give the users the possibility to execute and compare job executions in FP-Hadoop and Hadoop. They can retrieve general information about the job and the tasks and a summary of the phases. They can also visually compare different configurations to explore the difference between the approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papenbrock:2015:DPM, author = "Thorsten Papenbrock and Tanja Bergmann and Moritz Finke and Jakob Zwiener and Felix Naumann", title = "Data profiling with {Metanome}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1860--1863", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824086", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data profiling is the discipline of discovering metadata about given datasets. The metadata itself serve a variety of use cases, such as data integration, data cleansing, or query optimization. Due to the importance of data profiling in practice, many tools have emerged that support data scientists and IT professionals in this task. 
These tools provide good support for profiling statistics that are easy to compute, but they are usually lacking automatic and efficient discovery of complex statistics, such as inclusion dependencies, unique column combinations, or functional dependencies. We present Metanome, an extensible profiling platform that incorporates many state-of-the-art profiling algorithms. While Metanome is able to calculate simple profiling statistics in relational data, its focus lies on the automatic discovery of complex metadata. Metanome's goal is to provide novel profiling algorithms from research, perform comparative evaluations, and to support developers in building and testing new algorithms. In addition, Metanome is able to rank profiling results according to various metrics and to visualize the, at times, large metadata sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kumar:2015:DSO, author = "Arun Kumar and Mona Jalal and Boqun Yan and Jeffrey Naughton and Jignesh M. Patel", title = "Demonstration of {Santoku}: optimizing machine learning over normalized data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1864--1867", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824087", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Advanced analytics is a booming area in the data management industry and a hot research topic. Almost all toolkits that implement machine learning (ML) algorithms assume that the input is a single table, but most relational datasets are not stored as single tables due to normalization. Thus, analysts often join tables to obtain a denormalized table. 
Also, analysts typically ignore any functional dependencies among features because ML toolkits do not support them. In both cases, time is wasted in learning over data with redundancy. We demonstrate Santoku, a toolkit to help analysts improve the performance of ML over normalized data. Santoku applies the idea of factorized learning and automatically decides whether to denormalize or push ML computations through joins. Santoku also exploits database dependencies to provide automatic insights that could help analysts with exploratory feature selection. It is usable as a library in R, which is a popular environment for advanced analytics. We demonstrate the benefits of Santoku in improving ML performance and helping analysts with feature selection.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Seah:2015:PCP, author = "Boon Siew Seah and Sourav S. Bhowmick and Aixin Sun", title = "{PRISM}: concept-preserving summarization of top-$k$ social image search results", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1868--1871", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824088", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most existing tag-based social image search engines present search results as a ranked list of images, which cannot be consumed by users in a natural and intuitive manner. In this demonstration, we present a novel concept-preserving image search results summarization system called PRISM.
PRISM exploits both visual features and tags of the search results to generate high quality summary, which not only breaks the results into visually and semantically coherent clusters but it also maximizes the coverage of the original top-$k$ search results. It first constructs a visual similarity graph where the nodes are images in the top-$k$ search results and the edges represent visual similarities between pairs of images. This graph is optimally decomposed and compressed into a set of concept-preserving subgraphs based on a set of summarization criteria. One or more exemplar images from each subgraph is selected to form the exemplar summary of the result set. We demonstrate various innovative features of PRISM and the promise of superior quality summary construction of social image search results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Muller:2015:PST, author = "Tobias M{\"u}ller and Torsten Grust", title = "Provenance for {SQL} through abstract interpretation: value-less, but worthwhile", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1872--1875", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824089", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate the derivation of fine-grained where- and why-provenance for a rich dialect of SQL that includes recursion, (correlated) subqueries, windows, grouping/aggregation, and the RDBMS's library of built-in functions. The approach relies on ideas that originate in the programming language community --- program slicing and abstract interpretation, in particular.
A two-stage process first records a query's control flow decisions and locations of data access before it derives provenance without consultation of the actual data values (rendering the method largely ``value-less''). We will bring an interactive demonstrator that uses this provenance information to make input/output dependencies in real-world SQL queries tangible.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2015:SSQ, author = "Zhian He and Wai Kit Wong and Ben Kao and David Wai Lok Cheung and Rongbin Li and Siu Ming Yiu and Eric Lo", title = "{SDB}: a secure query processing system with data interoperability", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1876--1879", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824090", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We address security issues in a cloud database system which employs the DBaaS model --- a data owner (DO) exports data to a cloud database service provider (SP). To provide data security, sensitive data is encrypted by the DO before it is uploaded to the SP. Compared to existing secure query processing systems like CryptDB [7] and MONOMI [8], in which data operations (e.g., comparison or addition) are supported by specialized encryption schemes, our demo system, SDB, is implemented based on a set of data-interoperable secure operators, i.e., the output of an operator can be used as input of another operator. As a result, SDB can support a wide range of complex queries (e.g., all TPC-H queries) efficiently. In this demonstration, we show how our SDB prototype supports secure query processing on complex workload like TPC-H. 
We also demonstrate how our system protects sensitive information from malicious attackers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abdelaziz:2015:SVC, author = "Ibrahim Abdelaziz and Razen Harbi and Semih Salihoglu and Panos Kalnis and Nikos Mamoulis", title = "{SPARTex}: a vertex-centric framework for {RDF} data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1880--1883", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824091", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A growing number of applications require combining SPARQL queries with generic graph search on RDF data. However, the lack of procedural capabilities in SPARQL makes it inappropriate for graph analytics. Moreover, RDF engines focus on SPARQL query evaluation whereas graph management frameworks perform only generic graph computations. In this work, we bridge the gap by introducing SPARTex, an RDF analytics framework based on the vertex-centric computation model. In SPARTex, user-defined vertex centric programs can be invoked from SPARQL as stored procedures. SPARTex allows the execution of a pipeline of graph algorithms without the need for multiple reads/writes of input data and intermediate results. We use a cost-based optimizer for minimizing the communication cost. SPARTex evaluates queries that combine SPARQL and generic graph computations orders of magnitude faster than existing RDF engines. We demonstrate a real system prototype of SPARTex running on a local cluster using real and synthetic datasets. 
SPARTex has a real-time graphical user interface that allows the participants to write regular SPARQL queries, use our proposed SPARQL extension to declaratively invoke graph algorithms or combine/pipeline both SPARQL querying and generic graph analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2015:IDG, author = "Lu Chen and Yunjun Gao and Zhihao Xing and Christian S. Jensen and Gang Chen", title = "{I2RS}: a distributed geo-textual image retrieval and recommendation system", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1884--1887", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824092", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive amounts of geo-tagged and textually annotated images are provided by online photo services such as Flickr and Zommr. However, most existing image retrieval engines only consider text annotations. We present I2RS, a system that allows users to view geo-textual images on Google Maps, find hot topics within a specific geographic region and time period, retrieve images similar to a query image, and receive recommended images that they might be interested in. I2RS is a distributed geo-textual image retrieval and recommendation system that employs SPB-trees to index geo-textual images, and that utilizes metric similarity queries, including top-$m$ spatio-temporal range and k nearest neighbor queries, to support geo-textual image retrieval and recommendation. The system adopts the browser-server model, whereas the server is deployed in a distributed environment that enables efficiency and scalability to huge amounts of data and requests. 
A rich set of 100 million geo-textual images crawled from Flickr is used to demonstrate that, I2RS can return high-quality answers in an interactive way and support efficient updates for high image arrival rates.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bursztyn:2015:RBQ, author = "Damian Bursztyn and Fran{\c{c}}ois Goasdou{\'e} and Ioana Manolescu", title = "Reformulation-based query answering in {RDF}: alternatives and performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1888--1891", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824093", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Answering queries over Semantic Web data, i.e., RDF graphs, must account for both explicit data and implicit data, entailed by the explicit data and the semantic constraints holding on them. Two main query answering techniques have been devised, namely Saturation-based (Sat) which precomputes and adds to the graph all implicit information, and Reformulation-based (Ref) which reformulates the query based on the graph constraints, so that evaluating the reformulated query directly against the explicit data (i.e., without considering the constraints) produces the query answer. While Sat is well known, Ref has received less attention so far. In particular, reformulated queries often perform poorly if the query is complex. Our demonstration showcases a large set of Ref techniques, including but not limited to one we proposed recently.
The audience will be able to 1: test them against different datasets, constraints and queries, as well as different well-established systems, 2: analyze and understand the performance challenges they raise, and 3: alter the scenarios to visualize the impact on performance. In particular, we show how a cost-based Ref approach allows avoiding reformulation performance pitfalls.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bux:2015:SSS, author = "Marc Bux and J{\"o}rgen Brandt and Carsten Lipka and Kamal Hakimzadeh and Jim Dowling and Ulf Leser", title = "{SAASFEE}: scalable scientific workflow execution engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1892--1895", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824094", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Across many fields of science, primary data sets like sensor read-outs, time series, and genomic sequences are analyzed by complex chains of specialized tools and scripts exchanging intermediate results in domain-specific file formats. Scientific workflow management systems (SWfMSs) support the development and execution of these tool chains by providing workflow specification languages, graphical editors, fault-tolerant execution engines, etc. However, many SWfMSs are not prepared to handle large data sets because of inadequate support for distributed computing. On the other hand, most SWfMSs that do support distributed computing only allow static task execution orders. We present SAASFEE, a SWfMS which runs arbitrarily complex workflows on Hadoop YARN. 
Workflows are specified in Cuneiform, a functional workflow language focusing on parallelization and easy integration of existing software. Cuneiform workflows are executed on Hi-WAY, a higher-level scheduler for running workflows on YARN. Distinct features of SAASFEE are the ability to execute iterative workflows, an adaptive task scheduler, re-executable provenance traces, and compatibility to selected other workflow systems. In the demonstration, we present all components of SAASFEE using real-life workflows from the field of genomics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eldawy:2015:DHE, author = "Ahmed Eldawy and Mohamed F. Mokbel and Christopher Jonathan", title = "A demonstration of {HadoopViz}: an extensible {MapReduce} system for visualizing big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1896--1899", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824095", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demonstration presents HadoopViz; an extensible MapReduce-based system for visualizing Big Spatial Data. HadoopViz has two main unique features that distinguish it from other techniques. (1) It provides an extensible interface that allows users to visualize various types of data by defining five abstract functions, without delving into the details of the MapReduce algorithms. We show how it is used to create four types of visualizations, namely, scatter plot, road network, frequency heat map, and temperature heat map. (2) HadoopViz is capable of generating big images with giga-pixel resolution by employing a three-phase approach of partitioning, rasterize, and merging. 
HadoopViz generates single and multi-level images, where the latter allows users to zoom in/out to get more/less details. Both types of images are generated with a very high resolution using the extensible and scalable framework of HadoopViz.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bergman:2015:QQO, author = "Moria Bergman and Tova Milo and Slava Novgorodov and Wang-Chiew Tan", title = "{QOCO}: a query oriented data cleaning system with oracles", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1900--1903", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824096", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As key decisions are often made based on information contained in a database, it is important for the database to be as complete and correct as possible. For this reason, many data cleaning tools have been developed to automatically resolve inconsistencies in databases. However, data cleaning tools provide only best-effort results and usually cannot eradicate all errors that may exist in a database. Even more importantly, existing data cleaning tools do not typically address the problem of determining what information is missing from a database. To tackle these problems, we present QOCO, a novel query oriented cleaning system that leverages materialized views that are defined by user queries as a trigger for identifying the remaining incorrect/missing information. Given a user query, QOCO interacts with domain experts (which we model as oracle crowds) to identify potentially wrong or missing answers in the result of the user query, as well as determine and correct the wrong data that is the cause for the error(s). 
We will demonstrate QOCO over a World Cup Games database, and illustrate the interaction between QOCO and the oracles. Our demo audience will play the role of oracles, and we show how QOCO's underlying operations and optimization mechanisms can effectively prune the search space and minimize the number of questions that need to be posed to accelerate the cleaning process.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ying:2015:TFS, author = "Shanshan Ying and Flip Korn and Barna Saha and Divesh Srivastava", title = "{TreeScope}: finding structural anomalies in semi-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1904--1907", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824097", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Semi-structured data are prevalent on the web, with formats such as XML and JSON soaring in popularity due to their generality, flexibility and easy customization. However, these very same features make semi-structured data prone to a range of data quality errors, from errors in content to errors in structure. While the former has been well studied, little attention has been paid to structural errors. In this demonstration, we present TreeScope, which analyzes semi-structured data sets with the goal of automatically identifying structural anomalies from the data. Our techniques learn robust structural models that have high support, to identify potential errors in the structure. Identified structural anomalies are then concisely summarized to provide plausible explanations of the potential errors.
The goal of this demonstration is to enable an interactive exploration of the process of identifying and summarizing structural anomalies in semi-structured data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elmore:2015:DBP, author = "A. Elmore and J. Duggan and M. Stonebraker and M. Balazinska and U. Cetintemel and V. Gadepally and J. Heer and B. Howe and J. Kepner and T. Kraska and S. Madden and D. Maier and T. Mattson and S. Papadopoulos and J. Parkhurst and N. Tatbul and M. Vartak and S. Zdonik", title = "A demonstration of the {BigDAWG} polystore system", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1908--1911", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824098", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents BigDAWG, a reference implementation of a new architecture for ``Big Data'' applications. Such applications not only call for large-scale analytics, but also for real-time streaming support, smaller analytics at interactive speeds, data visualization, and cross-storage-system queries. Guided by the principle that ``one size does not fit all'', we build on top of a variety of storage engines, each designed for a specialized use case. To illustrate the promise of this approach, we demonstrate its effectiveness on a hospital application using data from an intensive care unit (ICU). This complex application serves the needs of doctors and researchers and provides real-time support for streams of patient data. It showcases novel approaches for querying across multiple storage engines, data visualization, and scalable real-time analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zoumpatianos:2015:RID, author = "Kostas Zoumpatianos and Stratos Idreos and Themis Palpanas", title = "{RINSE}: interactive data series exploration with {ADS+}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1912--1915", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824099", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Numerous applications continuously produce big amounts of data series, and in several time critical scenarios analysts need to be able to query these data as soon as they become available. An adaptive index data structure, ADS+, which is specifically tailored to solve the problem of indexing and querying very large data series collections has been recently proposed as a solution to this problem. The main idea is that instead of building the complete index over the complete data set up-front and querying only later, we interactively and adaptively build parts of the index, only for the parts of the data on which the users pose queries. The net effect is that instead of waiting for extended periods of time for the index creation, users can immediately start exploring the data series. In this work, we present a demonstration of ADS+; we introduce RINSE, a system that allows users to experience the benefits of the ADS+ adaptive index through an intuitive web interface. Users can explore large datasets and find patterns of interest, using nearest neighbor search. They can draw queries (data series) using a mouse, or touch screen, or they can select from a predefined list of data series. 
RINSE can scale to large data sizes, while drastically reducing the data to query delay: by the time state-of-the-art indexing techniques finish indexing 1 billion data series (and before answering even a single query), adaptive data series indexing can already answer $3 \times 10^5$ queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bhardwaj:2015:CDA, author = "Anant Bhardwaj and Amol Deshpande and Aaron J. Elmore and David Karger and Sam Madden and Aditya Parameswaran and Harihar Subramanyam and Eugene Wu and Rebecca Zhang", title = "Collaborative data analytics with {DataHub}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1916--1919", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824100", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While there have been many solutions proposed for storing and analyzing large volumes of data, all of these solutions have limited support for collaborative data analytics, especially given the many individuals and teams are simultaneously analyzing, modifying and exchanging datasets, employing a number of heterogeneous tools or languages for data analysis, and writing scripts to clean, preprocess, or query data. We demonstrate DataHub, a unified platform with the ability to load, store, query, collaboratively analyze, interactively visualize, interface with external applications, and share datasets.
We will demonstrate the following aspects of the DataHub platform: (a) flexible data storage, sharing, and native versioning capabilities: multiple conference attendees can concurrently update the database and browse the different versions and inspect conflicts; (b) an app ecosystem that hosts apps for various data-processing activities: conference attendees will be able to effortlessly ingest, query, and visualize data using our existing apps; (c) thrift-based data serialization permits data analysis in any combination of 20+ languages, with DataHub as the common data store: conference attendees will be able to analyze datasets in R, Python, and Matlab, while the inputs and the results are still stored in DataHub. In particular, conference attendees will be able to use the DataHub notebook --- an IPython-based notebook for analyzing data and storing the results of data analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shin:2015:MDD, author = "Jaeho Shin and Christopher R{\'e} and Michael Cafarella", title = "{Mindtagger}: a demonstration of data labeling in knowledge base construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1920--1923", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824101", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "End-to-end knowledge base construction systems using statistical inference are enabling more people to automatically extract high-quality domain-specific information from unstructured data. As a result of deploying DeepDive framework across several domains, we found new challenges in debugging and improving such end-to-end systems to construct high-quality knowledge bases.
DeepDive has an iterative development cycle in which users improve the data. To help our users, we needed to develop principles for analyzing the system's error as well as provide tooling for inspecting and labeling various data products of the system. We created guidelines for error analysis modeled after our colleagues' best practices, in which data labeling plays a critical role in every step of the analysis. To enable more productive and systematic data labeling, we created Mindtagger, a versatile tool that can be configured to support a wide range of tasks. In this demonstration, we show in detail what data labeling tasks are modeled in our error analysis guidelines and how each of them is performed using Mindtagger.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koutra:2015:PIL, author = "Danai Koutra and Di Jin and Yuanchi Ning and Christos Faloutsos", title = "{Perseus}: an interactive large-scale graph mining and visualization tool", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1924--1927", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824102", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a large graph with several millions or billions of nodes and edges, such as a social network, how can we explore it efficiently and find out what is in the data? In this demo we present Perseus, a large-scale system that enables the comprehensive analysis of large graphs by supporting the coupled summarization of graph properties and structures, guiding attention to outliers, and allowing the user to interactively explore normal and anomalous node behaviors.
Specifically, Perseus provides for the following operations: (1) It automatically extracts graph invariants (e.g., degree, PageRank, real eigenvectors) by performing scalable, offline batch processing on Hadoop; (2) It interactively visualizes univariate and bivariate distributions for those invariants; (3) It summarizes the properties of the nodes that the user selects; (4) It efficiently visualizes the induced subgraph of a selected node and its neighbors, by incrementally revealing its neighbors. In our demonstration, we invite the audience to interact with Perseus to explore a variety of multi-million-edge social networks including a Wikipedia vote network, a friendship/foeship network in Slashdot, and a trust network based on the consumer review website Epinions.com.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Joglekar:2015:SDN, author = "Manas Joglekar and Hector Garcia-Molina and Aditya Parameswaran", title = "Smart drill-down: a new data exploration operator", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1928--1931", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824103", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a data exploration system equipped with smart drill-down, a novel operator for interactively exploring a relational table to discover and summarize ``interesting'' groups of tuples. Each such group of tuples is represented by a rule. For instance, the rule (a, b, *, 1000) tells us that there are a thousand tuples with value a in the first column and b in the second column (and any value in the third column).
Smart drill-down presents an analyst with a list of rules that together describe interesting aspects of the table. The analyst can tailor the definition of interesting, and can interactively apply smart drill-down on an existing rule to explore that part of the table. In the demonstration, conference attendees will be able to use the data exploration system equipped with smart drill-down, and will be able to contrast smart drill-down to traditional drill-down, for various interestingness measures, and resource constraints.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dyreson:2015:VED, author = "Curtis E. Dyreson and Sourav S. Bhowmick and Ryan Grapp", title = "Virtual {eXist-db}: liberating hierarchical queries from the shackles of access path dependence", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1932--1935", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824104", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "XQuery programs can be hard to write and port to new data collections because the path expressions in a query are dependent on the hierarchy of the data. We propose to demonstrate a system to liberate query writers from this dependence. A plug-and-play query contains a specification of what data the query needs in order to evaluate. We implemented virtual eXist-db to support plug-and-play XQuery queries. Our system adds a virtualDoc function that lets a programmer sketch the hierarchy needed by the query, which may well be different than what the data has, and logically (not physically) transforms the data (with information loss guarantees) to the hierarchy specified by the virtualDoc. 
The demonstration will consist of a sequence of XQuery queries using a virtual hierarchy, including queries suggested by the audience. We will also demonstrate a GUI tool to construct a virtual hierarchy.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cortez:2015:ADS, author = "Eli Cortez and Philip A. Bernstein and Yeye He and Lev Novik", title = "Annotating database schemas to help enterprise search", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1936--1939", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824105", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In large enterprises, data discovery is a common problem faced by users who need to find relevant information in relational databases. In this scenario, schema annotation is a useful tool to enrich a database schema with descriptive keywords. In this paper, we demonstrate Barcelos, a system that automatically annotates corporate databases. Unlike existing annotation approaches that use Web oriented knowledge bases, Barcelos mines enterprise spreadsheets to find candidate annotations. Our experimental evaluation shows that Barcelos produces high quality annotations; the top-5 have an average precision of 87\%.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jayaram:2015:VAS, author = "Nandish Jayaram and Sidharth Goyal and Chengkai Li", title = "{VIIQ}: auto-suggestion enabled visual interface for interactive graph query formulation", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1940--1943", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824106", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present VIIQ (pronounced as wick), an interactive and iterative visual query formulation interface that helps users construct query graphs specifying their exact query intent. Heterogeneous graphs are increasingly used to represent complex relationships in schemaless data, which are usually queried using query graphs. Existing graph query systems offer little help to users in easily choosing the exact labels of the edges and vertices in the query graph. VIIQ helps users easily specify their exact query intent by providing a visual interface that lets them graphically add various query graph components, backed by an edge suggestion mechanism that suggests edges relevant to the user's query intent. In this demo we present: (1) a detailed description of the various features and user-friendly graphical interface of VIIQ, (2) a brief description of the edge suggestion algorithm, and (3) a demonstration scenario that we intend to show the audience.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2015:FSS, author = "Qingyuan Liu and Eduard C. 
Dragut and Arjun Mukherjee and Weiyi Meng", title = "{FLORIN}: a system to support (near) real-time applications on user generated content on daily news", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1944--1947", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824107", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose a system, FLORIN, which provides support for near real-time applications on user generated content on daily news. FLORIN continuously crawls news outlets for articles and user comments accompanying them. It attaches the articles and comments to daily event stories. It identifies the opinionated content in user comments and performs named entity recognition on news articles. All these pieces of information are organized hierarchically and exportable to other applications. Multiple applications can be built on this data. We have implemented a sentiment analysis system that runs on top of it.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:VVI, author = "Yunyao Li and Elmer Kim and Marc A. Touchette and Ramiya Venkatachalam and Hao Wang", title = "{VINERy}: a visual {IDE} for information extraction", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1948--1951", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824108", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Information Extraction (IE) is the key technology enabling analytics over unstructured and semi-structured data. 
Not surprisingly, it is becoming a critical building block for a wide range of emerging applications. To satisfy the rising demands for information extraction in real-world applications, it is crucial to lower the barrier to entry for IE development and enable users with general computer science background to develop higher quality extractors. In this demonstration, we present VINERy, an intuitive yet expressive visual IDE for information extraction. We show how it supports the full cycle of IE development without requiring a single line of code and enables a wide range of users to develop high quality IE extractors with minimal efforts. The extractors visually built in VINERy are automatically translated into semantically equivalent extractors in a state-of-the-art declarative language for IE. We also demonstrate how the auto-generated extractors can then be imported into a conventional Eclipse-based IDE for further enhancement. The results of our user studies indicate that VINERy is a significant step forward in facilitating extractor development for both expert and novice IE developers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2015:KRD, author = "Xu Chu and John Morcos and Ihab F.
Ilyas and Mourad Ouzzani and Paolo Papotti and Nan Tang and Yin Ye", title = "{KATARA}: reliable data cleaning with knowledge bases and crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1952--1955", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824109", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data cleaning with guaranteed reliability is hard to achieve without accessing external sources, since the truth is not necessarily discoverable from the data at hand. Furthermore, even in the presence of external sources, mainly knowledge bases and humans, effectively leveraging them still faces many challenges, such as aligning heterogeneous data sources and decomposing a complex task into simpler units that can be consumed by humans. We present Katara, a novel end-to-end data cleaning system powered by knowledge bases and crowdsourcing. Given a table, a kb, and a crowd, Katara (i) interprets the table semantics w.r.t. the given kb; (ii) identifies correct and wrong data; and (iii) generates top-$k$ possible repairs for the wrong data. Users will have the opportunity to experience the following features of Katara: (1) Easy specification: Users can define a Katara job with a browser-based specification; (2) Pattern validation: Users can help the system to resolve the ambiguity of different table patterns (i.e., table semantics) discovered by Katara; (3) Data annotation: Users can play the role of internal crowd workers, helping Katara annotate data. Moreover, Katara will visualize the annotated data as correct data validated by the kb, correct data jointly validated by the kb and the crowd, or erroneous tuples along with their possible repairs.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alvanaki:2015:GNB, author = "Foteini Alvanaki and Romulo Goncalves and Milena Ivanova and Martin Kersten and Kostis Kyzirakos", title = "{GIS} navigation boosted by column stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1956--1959", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824110", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Earth observation sciences, astronomy, and seismology have large data sets which have inherently rich spatial and geospatial information. In combination with large collections of semantically rich objects which have a large number of thematic properties, they form a new source of knowledge for urban planning, smart cities and natural resource management. Modeling and storing these properties indicating the relationships between them is best handled in a relational database. Furthermore, the scalability requirements posed by the latest 26-attribute light detection and ranging (LIDAR) data sets are a challenge for file-based solutions. In this demo we show how to query a 640 billion point data set using a column store enriched with GIS functionality. Through a lightweight and cache conscious secondary index called Imprints, spatial queries performance on a flat table storage is comparable to traditional file-based solutions. All the results are visualised in real time using QGIS.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arocena:2015:GCY, author = "Patricia C. Arocena and Radu Ciucanu and Boris Glavic and Ren{\'e}e J. 
Miller", title = "Gain control over your integration evaluations", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1960--1963", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824111", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Integration systems are typically evaluated using a few real-world scenarios (e.g., bibliographical or biological datasets) or using synthetic scenarios (e.g., based on star-schemas or other patterns for schemas and constraints). Reusing such evaluations is a cumbersome task because their focus is usually limited to showcasing a specific feature of an approach. This makes it difficult to compare integration solutions, understand their generality, and understand their performance for different application scenarios. Based on this observation, we demonstrate some of the requirements for developing integration benchmarks. We argue that the major abstractions used for integration problems have converged in the last decade which enables the application of robust empirical methods to integration problems (from schema evolution, to data exchange, to answering queries using views and many more). Specifically, we demonstrate that schema mappings are the main abstraction that now drives most integration solutions and show how a metadata generator can be used to create more credible evaluations of the performance and scalability of data integration systems. We will use the demonstration to evangelize for more robust, shared empirical evaluations of data integration systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Diao:2015:AAU, author = "Yanlei Diao and Kyriaki Dimitriadou and Zhan Li and Wenzhao Liu and Olga Papaemmanouil and Kemi Peng and Liping Peng", title = "{AIDE}: an automatic user navigation system for interactive data exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1964--1967", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824112", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data analysts often engage in data exploration tasks to discover interesting data patterns, without knowing exactly what they are looking for. Such exploration tasks can be very labor-intensive because they often require the user to review many results of ad-hoc queries and adjust the predicates of subsequent queries to balance the tradeoff between collecting all interesting information and reducing the size of returned data. In this demonstration we introduce AIDE, a system that automates these exploration tasks. AIDE steers the user towards interesting data areas based on her relevance feedback on database samples, aiming to achieve the goal of identifying all database objects that match the user interest with high efficiency. In our demonstration, conference attendees will see AIDE in action for a variety of exploration tasks on real-world datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aly:2015:DAA, author = "Ahmed M. Aly and Ahmed S. Abdelhamid and Ahmed R. Mahmood and Walid G. Aref and Mohamed S. 
Hassan and Hazem Elmeleegy and Mourad Ouzzani", title = "A demonstration of {AQWA}: adaptive query-workload-aware partitioning of big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1968--1971", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824113", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ubiquity of location-aware devices, e.g., smartphones and GPS devices, has led to a plethora of location-based services in which huge amounts of geotagged information need to be efficiently processed by large-scale computing clusters. This demo presents AQWA, an adaptive and query-workload-aware data partitioning mechanism for processing large-scale spatial data. Unlike existing cluster-based systems, e.g., SpatialHadoop, that apply static partitioning of spatial data, AQWA has the ability to react to changes in the query-workload and data distribution. A key feature of AQWA is that it does not assume prior knowledge of the query-workload or data distribution. Instead, AQWA reacts to changes in both the data and the query-workload by incrementally updating the partitioning of the data. We demonstrate two prototypes of AQWA deployed over Hadoop and Spark. In both prototypes, we process spatial range and $k$-nearest-neighbor ($k$NN, for short) queries over large-scale spatial datasets, and we exploit the performance of AQWA under different query-workloads.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dittrich:2015:JID, author = "Jens Dittrich and Patrick Bender", title = "Janiform intra-document analytics for reproducible research", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1972--1975", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824114", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Peer-reviewed publication of research papers is a cornerstone of science. However, one of the many issues of our publication culture is that our publications only publish a summary of the final result of a long project. This means that we put well-polished graphs describing (some) of our experimental results into our publications. However, the algorithms, input datasets, benchmarks, raw result datasets, as well as scripts that were used to produce the graphs in the first place are rarely published and typically not available to other researchers. Often they are only available when personally asking the authors. In many cases, however, they are not available at all. This means from a long workflow that led to producing a graph for a research paper, we only publish the final result rather than the entire workflow. This is unfortunate and has been criticized in various scientific communities. In this demo we argue that one part of the problem is our dated view on what a ``document'' and hence ``a publication'' is, should, and can be. As a remedy, we introduce portable database files (PDbF). These files are janiform, i.e. they are at the same time a standard static pdf as well as a highly dynamic (offline) HTML-document. 
PDbFs allow you to access the raw data behind a graph, perform OLAP-style analysis, and reproduce your own graphs from the raw data --- all of this within a portable document. We demo a tool allowing you to create PDbFs smoothly from within {\LaTeX}. This tool allows you to preserve the workflow of raw measurement data to its final graphical output through all processing steps. Notice that this pdf already showcases our technology: rename this file to ``.html'' and see what happens (currently we support the desktop versions of Firefox, Chrome, and Safari). But please: do not try to rename this file to ``.ova'' and mount it in VirtualBox.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schubert:2015:FCU, author = "Erich Schubert and Alexander Koos and Tobias Emrich and Andreas Z{\"u}fle and Klaus Arthur Schmid and Arthur Zimek", title = "A framework for clustering uncertain data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1976--1979", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824115", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The challenges associated with handling uncertain data, in particular with querying and mining, are finding increasing attention in the research community. Here we focus on clustering uncertain data and describe a general framework for this purpose that also allows to visualize and understand the impact of uncertainty---using different uncertainty models---on the data mining results.
Our framework constitutes release 0.7 of ELKI (http://elki.dbs.ifi.lmu.de/) and thus comes along with a plethora of implementations of algorithms, distance measures, indexing techniques, evaluation measures and visualization components.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bidoit:2015:EWA, author = "Nicole Bidoit and Melanie Herschel and Katerina Tzompanaki", title = "{EFQ}: why-not answer polynomials in action", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1980--1983", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824116", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "One important issue in modern database applications is supporting the user with efficient tools to debug and fix queries because such tasks are both time and skill demanding. One particular problem is known as Why-Not question and focusses on the reasons for missing tuples from query results. The EFQ platform demonstrated here has been designed in this context to efficiently leverage Why-Not Answers polynomials, a novel approach that provides the user with complete explanations to Why-Not questions and allows for automatic, relevant query refinements.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2015:EDD, author = "Xiaolan Wang and Mary Feng and Yue Wang and Xin Luna Dong and Alexandra Meliou", title = "Error diagnosis and data profiling with {DataXRay}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1984--1987", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824117", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of identifying and repairing data errors has been an area of persistent focus in data management research. However, while traditional data cleaning techniques can be effective at identifying several data discrepancies, they disregard the fact that many errors are systematic, inherent to the process that produces the data, and thus will keep occurring unless the root cause is identified and corrected. In this demonstration, we will present a large-scale diagnostic framework called DataXRay. Like a medical X-ray that aids the diagnosis of medical conditions by revealing problems underneath the surface, DataXRay reveals hidden connections and common properties among data errors. Thus, in contrast to traditional cleaning methods, which treat the symptoms, our system investigates the underlying conditions that cause the errors. The core of DataXRay combines an intuitive and principled cost model derived by Bayesian analysis, and an efficient, highly-parallelizable diagnostic algorithm that discovers common properties among erroneous data elements in a top-down fashion. Our system has a simple interface that allows users to load different datasets, to interactively adjust key diagnostic parameters, to explore the derived diagnoses, and to compare with solutions produced by alternative algorithms.
Through this demonstration, participants will understand (1) the characteristics of good diagnoses, (2) how and why errors occur in real-world datasets, and (3) the distinctions with other related problems and approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pham:2015:SRD, author = "Quan Pham and Severin Thaler and Tanu Malik and Ian Foster and Boris Glavic", title = "Sharing and reproducing database applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1988--1991", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824118", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sharing and repeating scientific applications is crucial for verifying claims, reproducing experimental results (e.g., to repeat a computational experiment described in a publication), and promoting reuse of complex applications. The predominant methods of sharing and making applications repeatable are building a companion web site and/or provisioning a virtual machine image (VMI). Recently, application virtualization (AV), has emerged as a light-weight alternative for sharing and efficient repeatability. AV approaches such as Linux Containers create a chroot-like environment [4], while approaches such as CDE [1] trace system calls during application execution to copy all binaries, data, and software dependencies into a self-contained package.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wylot:2015:DTT, author = "Marcin Wylot and Philippe Cudr{\'e}-Mauroux and Paul Groth", title = "A demonstration of {TripleProv}: tracking and querying provenance over {Web} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1992--1995", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824119", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The proliferation of heterogeneous Linked Data on the Web poses new challenges to database systems. In particular, the capacity to store, track, and query provenance data is becoming a pivotal feature of modern triple stores. In this demonstration, we present TripleProv: a new system extending a native RDF store to efficiently handle the storage, tracking and querying of provenance in RDF data. In the following, we give an overview of our approach providing a reliable and understandable specification of the way results were derived from the data and how particular pieces of data were combined to answer the query. Subsequently, we present techniques enabling to tailor queries with provenance data. Finally, we describe our demonstration and how the attendees will be able to interact with our system during the conference.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ortona:2015:WJW, author = "Stefano Ortona and Giorgio Orsi and Marcello Buoncristiano and Tim Furche", title = "{WADaR}: joint wrapper and data repair", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "1996--1999", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824120", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Web scraping (or wrapping) is a popular means for acquiring data from the web. Recent advancements have made scalable wrapper-generation possible and enabled data acquisition processes involving thousands of sources. This makes wrapper analysis and maintenance both needed and challenging as no scalable tools exists that support these tasks. We demonstrate WADaR, a scalable and highly automated tool for joint wrapper and data repair. WADaR uses off-the-shelf entity recognisers to locate target entities in wrapper-generated data. Markov chains are used to determine structural repairs, that are then encoded into suitable repairs for both the data and corresponding wrappers. We show that WADaR is able to increase the quality of wrapper-generated relations between 15\% and 60\%, and to fully repair the corresponding wrapper without any knowledge of the original website in more than 50\% of the cases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bendre:2015:DUD, author = "Mangesh Bendre and Bofan Sun and Ding Zhang and Xinyan Zhou and Kevin Chen-Chuan Chang and Aditya Parameswaran", title = "{DataSpread}: unifying databases and spreadsheets", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2000--2003", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824121", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spreadsheet software is often the tool of choice for ad-hoc tabular data management, processing, and visualization, especially on tiny data sets. On the other hand, relational database systems offer significant power, expressivity, and efficiency over spreadsheet software for data management, while lacking in the ease of use and ad-hoc analysis capabilities. We demonstrate DataSpread, a data exploration tool that holistically unifies databases and spreadsheets. It continues to offer a Microsoft Excel-based spreadsheet front-end, while in parallel managing all the data in a back-end database, specifically, PostgreSQL. DataSpread retains all the advantages of spreadsheets, including ease of use, ad-hoc analysis and visualization capabilities, and a schema-free nature, while also adding the advantages of traditional relational databases, such as scalability and the ability to use arbitrary SQL to import, filter, or join external or internal tables and have the results appear in the spreadsheet. DataSpread needs to reason about and reconcile differences in the notions of schema, addressing of cells and tuples, and the current ``pane'' (which exists in spreadsheets but not in traditional databases), and support data modifications at both the front-end and the back-end.
Our demonstration will center on our first and early prototype of the DataSpread, and will give the attendees a sense for the enormous data exploration capabilities offered by unifying spreadsheets and databases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haas:2015:WNS, author = "Daniel Haas and Sanjay Krishnan and Jiannan Wang and Michael J. Franklin and Eugene Wu", title = "{Wisteria}: nurturing scalable data cleaning infrastructure", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2004--2007", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824122", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analysts report spending upwards of 80\% of their time on problems in data cleaning. The data cleaning process is inherently iterative, with evolving cleaning workflows that start with basic exploratory data analysis on small samples of dirty data, then refine analysis with more sophisticated/expensive cleaning operators (e.g., crowdsourcing), and finally apply the insights to a full dataset. While an analyst often knows at a logical level what operations need to be done, they often have to manage a large search space of physical operators and parameters. We present Wisteria, a system designed to support the iterative development and optimization of data cleaning workflows, especially ones that utilize the crowd. Wisteria separates logical operations from physical implementations, and driven by analyst feedback, suggests optimizations and/or replacements to the analyst's choice of physical implementation. We highlight research challenges in sampling, in-flight operator replacement, and crowdsourcing. 
We overview the system architecture and these techniques, then provide a demonstration designed to showcase how Wisteria can improve iterative data analysis and cleaning. The code is available at: http://www.sampleclean.org.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{S:2015:CDA, author = "Ashoke S. and Jayant R. Haritsa", title = "{CODD}: a dataless approach to big data testing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2008--2011", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824123", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The construction and development of the so-called Big Data systems has occupied centerstage in the data management community in recent years. However, there has been comparatively little attention paid to the testing of such systems, an essential pre-requisite for successful deployment. This is surprising given that traditional testing techniques, which typically involve construction of representative databases and regression query suites, are completely impractical at Big Data scale --- simply due to the time and space overheads involved in their execution. For instance, consider the situation where a database engineer wishes to evaluate the query optimizer's behavior on a futuristic Big Data setup featuring ``yottabyte'' ($ 10^{24} $ bytes) sized relational tables. Obviously, just generating this data, let alone storing it, is practically infeasible even on the best of systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cebiric:2015:QOS, author = "Sejla Cebiri{\'c} and Fran{\c{c}}ois Goasdou{\'e} and Ioana Manolescu", title = "Query-oriented summarization of {RDF} graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2012--2015", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824124", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Resource Description Framework (RDF) is a graph-based data model promoted by the W3C as the standard for Semantic Web applications. Its associated query language is SPARQL. RDF graphs are often large and varied, produced in a variety of contexts, e.g., scientific applications, social or online media, government data etc. They are heterogeneous, i.e., resources described in an RDF graph may have very different sets of properties. An RDF resource may have: no types, one or several types (which may or may not be related to each other). RDF Schema (RDFS) information may optionally be attached to an RDF graph, to enhance the description of its resources. Such statements also entail that in an RDF graph, some data is implicit. According to the W3C RDF and SPARQL specification, the semantics of an RDF graph comprises both its explicit and implicit data; in particular, SPARQL query answers must be computed reflecting both the explicit and implicit data. These features make RDF graphs complex, both structurally and conceptually. It is intrinsically hard to get familiar with a new RDF dataset, especially if an RDF schema is sparse or not available at all.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chodpathumwan:2015:UDT, author = "Yodsawalai Chodpathumwan and Amirhossein Aleyasen and Arash Termehchy and Yizhou Sun", title = "{Universal-DB}: towards representation independent graph analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2016--2019", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824125", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph analytics algorithms leverage quantifiable structural properties of the data to predict interesting concepts and relationships. The same information, however, can be represented using many different structures and the structural properties observed over particular representations do not necessarily hold for alternative structures. Because these algorithms tend to be highly effective over some choices of structure, such as that of the databases used to validate them, but not so effective with others, graph analytics has largely remained the province of experts who can find the desired forms for these algorithms. We argue that in order to make graph analytics usable, we should develop systems that are effective over a wide range of choices of structural organizations. We demonstrate Universal-DB an entity similarity and proximity search system that returns the same answers for a query over a wide range of choices to represent the input database.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mahmood:2015:TDS, author = "Ahmed R. Mahmood and Ahmed M. Aly and Thamir Qadah and El Kindi Rezig and Anas Daghistani and Amgad Madkour and Ahmed S. 
Abdelhamid and Mohamed S. Hassan and Walid G. Aref and Saleh Basalamah", title = "{Tornado}: a distributed spatio-textual stream processing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2020--2023", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824126", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The widespread use of location-aware devices together with the increased popularity of micro-blogging applications (e.g., Twitter) led to the creation of large streams of spatio-textual data. In order to serve real-time applications, the processing of these large-scale spatio-textual streams needs to be distributed. However, existing distributed stream processing systems (e.g., Spark and Storm) are not optimized for spatial/textual content. In this demonstration, we introduce Tornado, a distributed in-memory spatio-textual stream processing server that extends Storm. To efficiently process spatio-textual streams, Tornado introduces a spatio-textual indexing layer to the architecture of Storm. The indexing layer is adaptive, i.e., dynamically re-distributes the processing across the system according to changes in the data distribution and/or query workload. In addition to keywords, higher-level textual concepts are identified and are semantically matched against spatio-textual queries. Tornado provides data deduplication and fusion to eliminate redundant textual data. We demonstrate a prototype of Tornado running against real Twitter streams, where the users can register continuous or snapshot spatio-textual queries using a map-assisted query-interface.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Crotty:2015:VIA, author = "Andrew Crotty and Alex Galakatos and Emanuel Zgraggen and Carsten Binnig and Tim Kraska", title = "{Vizdom}: interactive analytics through pen and touch", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2024--2027", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824127", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning (ML) and advanced statistics are important tools for drawing insights from large datasets. However, these techniques often require human intervention to steer computation towards meaningful results. In this demo, we present Vizdom, a new system for interactive analytics through pen and touch. Vizdom's frontend allows users to visually compose complex workflows of ML and statistics operators on an interactive whiteboard, and the back-end leverages recent advances in workflow compilation techniques to run these computations at interactive speeds. Additionally, we are exploring approximation techniques for quickly visualizing partial results that incrementally refine over time. This demo will show Vizdom's capabilities by allowing users to interactively build complex analytics workflows using real-world datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Consens:2015:SCE, author = "Mariano P. 
Consens and Valeria Fionda and Shahan Khatchadourian and Giuseppe Pirr{\`o}", title = "{S+EPPs}: construct and explore bisimulation summaries, plus optimize navigational queries; all on existing {SPARQL} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2028--2031", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824128", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate S+EPPs, a system that provides fast construction of bisimulation summaries using graph analytics platforms, and then enhances existing SPARQL engines to support summary-based exploration and navigational query optimization. The construction component adds a novel optimization to a parallel bisimulation algorithm implemented on a multi-core graph processing framework. We show that for several large, disk resident, real world graphs, full summary construction can be completed in roughly the same time as the data load. The query translation component supports Extended Property Paths (EPPs), an enhancement of SPARQL 1.1 property paths that can express a significantly larger class of navigational queries. EPPs are implemented via rewritings into a widely used SPARQL subset. The optimization component can (transparently to users) translate EPPs defined on instance graphs into EPPs that take advantage of bisimulation summaries. S+EPPs combines the query and optimization translations to enable summary-based optimization of graph traversal queries on top of off-the-shelf SPARQL processors. 
The demonstration showcases the construction of bisimulation summaries of graphs (ranging from millions to billions of edges), together with the exploration benefits and the navigational query speedups obtained by leveraging summaries stored alongside the original datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xirogiannopoulos:2015:GEI, author = "Konstantinos Xirogiannopoulos and Udayan Khurana and Amol Deshpande", title = "{GraphGen}: exploring interesting graphs in relational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2032--2035", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824129", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analyzing interconnection structures among the data through the use of graph algorithms and graph analytics has been shown to provide tremendous value in many application domains. However, graphs are not the primary choice for how most data is currently stored, and users who want to employ graph analytics are forced to extract data from their data stores, construct the requisite graphs, and then use a specialized engine to write and execute their graph analysis tasks. This cumbersome and costly process not only raises barriers in using graph analytics, but also makes it hard to explore and identify hidden or implicit graphs in the data. Here we demonstrate a system, called GraphGen, that enables users to declaratively specify graph extraction tasks over relational databases, visually explore the extracted graphs, and write and execute graph algorithms over them, either directly or using existing graph libraries like the widely used NetworkX Python library. 
We also demonstrate how unifying the extraction tasks and the graph algorithms enables significant optimizations that would not be possible otherwise.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yoon:2015:DPF, author = "Dong Young Yoon and Barzan Mozafari and Douglas P. Brown", title = "{DBSeer}: pain-free database administration through workload intelligence", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2036--2039", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824130", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The pressing need for achieving and maintaining high performance in database systems has made database administration one of the most stressful jobs in information technology. On the other hand, the increasing complexity of database systems has made qualified database administrators (DBAs) a scarce resource. DBAs are now responsible for an array of demanding tasks; they need to (i) provision and tune their database according to their application requirements, (ii) constantly monitor their database for any performance failures or slowdowns, (iii) diagnose the root cause of the performance problem in an accurate and timely fashion, and (iv) take prompt actions that can restore acceptable database performance. However, much of the research in the past years has focused on improving the raw performance of the database systems, rather than improving their manageability. Besides sophisticated consoles for monitoring performance and a few auto-tuning wizards, DBAs are not provided with any help other than their own many years of experience. 
Typically, their only resort is trial-and-error, which is a tedious, ad-hoc and often sub-optimal solution. In this demonstration, we present DBSeer, a workload intelligence framework that exploits advanced machine learning and causality techniques to aid DBAs in their various responsibilities. DBSeer analyzes large volumes of statistics and telemetry data collected from various log files to provide the DBA with a suite of rich functionalities including performance prediction, performance diagnosis, bottleneck explanation, workload insight, optimal admission control, and what-if analysis. In this demo, we showcase various features of DBSeer by predicting and analyzing the performance of a live database system. We will also reproduce a number of realistic performance problems in the system, and allow the audience to use DBSeer to quickly diagnose and resolve their root cause.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kejariwal:2015:RTA, author = "Arun Kejariwal and Sanjeev Kulkarni and Karthik Ramasamy", title = "Real time analytics: algorithms and systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2040--2041", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824132", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Velocity is one of the 4 Vs commonly used to characterize Big Data [5]. In this regard, Forrester remarked the following in Q3 2014 [8]: ``The high velocity, white-water flow of data from innumerable real-time data sources such as market data, Internet of Things, mobile, sensors, click-stream, and even transactions remain largely unnavigated by most firms. 
The opportunity to leverage streaming analytics has never been greater.'' Example use cases of streaming analytics include, but not limited to: (a) visualization of business metrics in real-time (b) facilitating highly personalized experiences (c) expediting response during emergencies. Streaming analytics is extensively used in a wide variety of domains such as healthcare, e-commerce, financial services, telecommunications, energy and utilities, manufacturing, government and transportation. In this tutorial, we shall present an in-depth overview of streaming analytics --- applications, algorithms and platforms --- landscape. We shall walk through how the field has evolved over the last decade and then discuss the current challenges --- the impact of the other three Vs, viz., Volume, Variety and Veracity, on Big Data streaming analytics. The tutorial is intended for both researchers and practitioners in the industry. We shall also present state-of-the-affairs of streaming analytics at Twitter.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2015:UGM, author = "Arijit Khan and Lei Chen", title = "On uncertain graphs modeling and queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2042--2043", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824133", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale, highly-interconnected networks pervade both our society and the natural world around us. 
Uncertainty, on the other hand, is inherent in the underlying data due to a variety of reasons, such as noisy measurements, lack of precise information needs, inference and prediction models, or explicit manipulation, e.g., for privacy purposes. Therefore, uncertain, or probabilistic, graphs are increasingly used to represent noisy linked data in many emerging application scenarios, and they have recently become a hot topic in the database research community. While many classical graph algorithms such as reachability and shortest path queries become \#P-complete, and hence, more expensive in uncertain graphs; various complex queries are also emerging over uncertain networks, such as pattern matching, information diffusion, and influence maximization queries. In this tutorial, we discuss the sources of uncertain graphs and their applications, uncertainty modeling, as well as the complexities and algorithmic advances on uncertain graphs processing in the context of both classical and emerging graph queries. We emphasize the current challenges and highlight some future research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2015:TMI, author = "Xin Luna Dong and Wang-Chiew Tan", title = "A time machine for information: looking back to look forward", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2044--2045", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824134", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the abundant availability of information one can mine from the Web today, there is increasing interest to develop a complete understanding of the history of an entity (i.e., a person, a company, a music genre, a country, etc.) (see, for example, [7, 9, 10, 11]) and to depict trends over time [5, 12, 13]. This, however, remains a largely difficult and manual task despite more than a couple of decades of research in the areas of temporal databases and data integration.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Das:2015:SAS, author = "Mahashweta Das and Gautam Das", title = "Structured analytics in social media", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2046--2047", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824135", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rise of social media has turned the Web into an online community where people connect, communicate, and collaborate with each other. 
Structured analytics in social media is the process of discovering the structure of the relationships emerging from this social media use. It focuses on identifying the users involved, the activities they undertake, the actions they perform, and the items (e.g., movies, restaurants, blogs, etc.) they create and interact with. There are two key challenges facing these tasks: how to organize and model social media content, which is often unstructured in its raw form, in order to employ structured analytics on it; and how to employ analytics algorithms to capture both explicit link-based relationships and implicit behavior-based relationships. In this tutorial, we systemize and summarize the research so far in analyzing social interactions between users and items in the Web from data mining and database perspectives. We start with a general overview of the topic, including discourse to various exciting and practical applications. Then, we discuss the state-of-art for modeling the data, formalizing the mining task, developing the algorithmic solutions, and evaluating on real datasets. We also emphasize open problems and challenges for future research in the area of structured analytics and social media.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2015:TDC, author = "Jing Gao and Qi Li and Bo Zhao and Wei Fan and Jiawei Han", title = "Truth discovery and crowdsourcing aggregation: a unified perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2048--2049", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824136", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of Big Data, data entries, even describing the same objects or events, can come from a variety of sources, where a data source can be a web page, a database or a person. Consequently, conflicts among sources become inevitable. To resolve the conflicts and achieve high quality data, truth discovery and crowdsourcing aggregation have been studied intensively. However, although these two topics have a lot in common, they are studied separately and are applied to different domains. To answer the need of a systematic introduction and comparison of the two topics, we present an organized picture on truth discovery and crowdsourcing aggregation in this tutorial. They are compared on both theory and application levels, and their related areas as well as open questions are discussed.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abadi:2015:SHS, author = "Daniel Abadi and Shivnath Babu and Fatma {\"O}zcan and Ippokratis Pandis", title = "{SQL-on-Hadoop} systems: tutorial", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2050--2051", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824137", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Enterprises are increasingly using Apache Hadoop, more specifically HDFS, as a central repository for all their data; data coming from various sources, including operational systems, social media and the web, sensors and smart devices, as well as their applications. At the same time many enterprise data management tools (e.g. from SAP ERP and SAS to Tableau) rely on SQL and many enterprise users are familiar and comfortable with SQL. As a result, SQL processing over Hadoop data has gained significant traction over the recent years, and the number of systems that provide such capability has increased significantly. In this tutorial we use the term SQL-on-Hadoop to refer to systems that provide some level of declarative SQL(-like) processing over HDFS and noSQL data sources, using architectures that include computational or storage engines compatible with Apache Hadoop.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Loaiza:2015:EDH, author = "Juan Loaiza", title = "Engineering database hardware and software together", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2052--2052", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824139", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Since its inception, Oracle's database software primarily ran on customer configured off-the-shelf hardware. A decade ago, the architecture of conventional systems started to become a bottleneck and Oracle developed the Oracle Exadata Database Machine to optimize the full hardware and software stack for database workloads. Exadata is based on a scale-out architecture of database servers and storage servers that optimizes both OLTP and Analytic workloads while hosting hundreds of databases simultaneously on the same system. By using database specific protocols for storage and networking we bypass limitations imposed by conventional network and storage layers. Exadata is now deployed at thousands of Enterprises including 4 of the 5 largest banks, telecoms, and retailers for varied workloads such as interbank funds transfers, e-commerce, ERP, Cloud SaaS applications, and petabyte data warehouses. Five years ago, Oracle initiated a project to extend our database stack beyond software and systems and into the architecture of the microprocessor itself. The goal of this effort is to dramatically improve the performance, reliability and cost effectiveness of a new generation of database machines. The new SPARC M7 processor is the first step. The M7 is an extraordinarily fast conventional processor with 32-cores per socket and an extremely high bandwidth memory system. 
Added to its conventional processing capabilities are 32 custom on-chip database co-processors that run database searches at full memory bandwidth rates, and decompress data in real-time to increase memory bandwidth and capacity. Further, the M7 implements innovative fine-grained memory protection to secure sensitive business data. In the presentation we will describe how Oracle's engineering teams integrate software and hardware at all levels to achieve breakthrough performance, reliability, and security for the database and rest of the modern data processing stack.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Balazinska:2015:BDR, author = "Magdalena Balazinska", title = "Big data research: will industry solve all the problems?", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2053--2056", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824140", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The need for effective tools for big data data management and analytics continues to grow. While the ecosystem of tools is expanding many research problems remain open: they include challenges around efficient processing, flexible analytics, ease of use, and operation as a service. Many new systems and much innovation, however, come from industry (or from academic projects that quickly became big players in industry). An important question for our community is whether industry will solve all the problems or whether there is a place for academic research in big data and what is that place. 
In this paper, we address this question by looking back at our research on the Nuage, CQMS, Myria, and Data Pricing projects, and the SciDB collaboration.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Walter:2015:BPB, author = "Todd Walter", title = "Big plateaus of {Big Data} on the big island", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2057--2057", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In ancient texts, 40 was a magic number. It meant a ``lot'' or ``a long time.'' 40 years represented the time it took for a new generation to arise. A look back at 40 years of VLDB suggests that this applies to database researchers as well --- the young researchers of the early VLDBs are now the old folks of the database world, and a new generation is creating afresh. Over this period many plateaus of ``Big Data'' have challenged the database community and been conquered. But there is still no free lunch --- database research is really the science of trade-offs, many of which are no different today than 40 years ago. And of course the evolution of hardware technology continues to swing the trade-off pendulum while enabling new plateaus to be reached. Todd will take a look back at customer big data plateaus of the past. He will look at where we are today, then use his crystal ball and the lessons of the past to extrapolate the next several plateaus --- how they will be the same and how will they be different. Along the way we will have a little fun with some VLDB and Teradata history.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ailamaki:2015:DHB, author = "Anastasia Ailamaki", title = "Databases and hardware: the beginning and sequel of a beautiful friendship", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "12", pages = "2058--2061", month = aug, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2824032.2824142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 16 18:23:11 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fast query and transaction processing is the goal of 40 years of database research and the reason of existence for many new database system architectures. In data management, system performance means acceptable response time and throughput on critical-path operations, ideally with scalability guarantees. Performance is improved with top-of-the line research on data processing algorithms; efficiency, however, is contingent on seamless collaboration between the database software and hardware and storage devices. In 1980, the goal was to minimize disk accesses; in 2000, memory replaced disks in terms of access costs. Nowadays performance is synonymous to scalability; scalability, in turn, translates into sustainable and predictable use of hardware resources in the face of embarrassing parallelism and deep storage hierarchies while minimizing energy needs --- a challenging goal in multiple dimensions. We discuss work done in the past four decades to tighten the interaction between the database software and underlying hardware and show that, as application and microarchitecture roadmaps evolve, the effort of maintaining smooth collaboration blossoms into a multitude of interesting research avenues with direct technological impact.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aly:2015:AAQ, author = "Ahmed M. Aly and Ahmed R. Mahmood and Mohamed S. Hassan and Walid G. Aref and Mourad Ouzzani and Hazem Elmeleegy and Thamir Qadah", title = "{AQWA}: adaptive query workload aware partitioning of big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2062--2073", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831361", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The unprecedented spread of location-aware devices has resulted in a plethora of location-based services in which huge amounts of spatial data need to be efficiently processed by large-scale computing clusters. Existing cluster-based systems for processing spatial data employ static data-partitioning structures that cannot adapt to data changes, and that are insensitive to the query workload. Hence, these systems are incapable of consistently providing good performance. To close this gap, we present AQWA, an adaptive and query-workload-aware mechanism for partitioning large-scale spatial data. AQWA does not assume prior knowledge of the data distribution or the query workload. Instead, as data is consumed and queries are processed, the data partitions are incrementally updated. With extensive experiments using real spatial data from Twitter, and various workloads of range and $k$-nearest-neighbor queries, we demonstrate that AQWA can achieve an order of magnitude enhancement in query performance compared to the state-of-the-art systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khayyat:2015:LFS, author = "Zuhair Khayyat and William Lucia and Meghna Singh and Mourad Ouzzani and Paolo Papotti and Jorge-Arnulfo Quian{\'e}-Ruiz and Nan Tang and Panos Kalnis", title = "Lightning fast and space efficient inequality joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2074--2085", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831362", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See erratum \cite{Khayyat:2017:ELF}.", abstract = "Inequality joins, which join relational tables on inequality conditions, are used in various applications. While there have been a wide range of optimization methods for joins in database systems, from algorithms such as sort-merge join and band join, to various indices such as B$^+$-tree, R$^*$-tree and Bitmap, inequality joins have received little attention and queries containing such joins are usually very slow. In this paper, we introduce fast inequality join algorithms. We put columns to be joined in sorted arrays and we use permutation arrays to encode positions of tuples in one sorted array w.r.t. the other sorted array. In contrast to sort-merge join, we use space efficient bit-arrays that enable optimizations, such as Bloom filter indices, for fast computation of the join results. We have implemented a centralized version of these algorithms on top of PostgreSQL, and a distributed version on top of Spark SQL. We have compared against well known optimization techniques for inequality joins and show that our solution is more scalable and several orders of magnitude faster.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2015:FPO, author = "Jinfei Liu and Li Xiong and Jian Pei and Jun Luo and Haoyu Zhang", title = "Finding {Pareto} optimal groups: group-based skyline", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2086--2097", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831363", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Skyline computation, aiming at identifying a set of skyline points that are not dominated by any other point, is particularly useful for multi-criteria data analysis and decision making. Traditional skyline computation, however, is inadequate to answer queries that need to analyze not only individual points but also groups of points. To address this gap, we generalize the original skyline definition to the novel group-based skyline (G-Skyline), which represents Pareto optimal groups that are not dominated by other groups. In order to compute G-Skyline groups consisting of k points efficiently, we present a novel structure that represents the points in a directed skyline graph and captures the dominance relationships among the points based on the first k skyline layers. We propose efficient algorithms to compute the first k skyline layers. We then present two heuristic algorithms to efficiently compute the G-Skyline groups: the point-wise algorithm and the unit group-wise algorithm, using various pruning strategies. The experimental results on the real NBA dataset and the synthetic datasets show that G-Skyline is interesting and useful, and our algorithms are efficient and scalable.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Faulkner:2015:RQN, author = "Taylor Kessler Faulkner and Will Brackenbury and Ashwin Lall", title = "$k$-regret queries with nonlinear utilities", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2098--2109", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831364", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In exploring representative databases, a primary issue has been finding accurate models of user preferences. Given this, our work generalizes the method of regret minimization as proposed by Nanongkai et al. to include nonlinear utility functions. Regret minimization is an approach for selecting $k$ representative points from a database such that every user's ideal point in the entire database is similar to one of the $k$ points. This approach combines benefits of the methods top-$k$ and skyline; it controls the size of the output but does not require knowledge of users' preferences. Prior work with $k$-regret queries assumes users' preferences to be modeled by linear utility functions. In this paper, we derive upper and lower bounds for nonlinear utility functions, as these functions can better fit occurrences such as diminishing marginal returns, propensity for risk, and substitutability of preferences. To model these phenomena, we analyze a broad subset of convex, concave, and constant elasticity of substitution functions. We also run simulations on real and synthetic data to prove the efficacy of our bounds in practice.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shi:2015:CTM, author = "Juwei Shi and Yunjie Qiu and Umar Farooq Minhas and Limei Jiao and Chen Wang and Berthold Reinwald and Fatma {\"O}zcan", title = "Clash of the titans: {MapReduce} vs. {Spark} for large scale data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2110--2121", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831365", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MapReduce and Spark are two very popular open source cluster computing frameworks for large scale data analytics. These frameworks hide the complexity of task parallelism and fault-tolerance, by exposing a simple programming API to users. In this paper, we evaluate the major architectural components in MapReduce and Spark frameworks including: shuffle, execution model, and caching, by using a set of important analytic workloads. To conduct a detailed analysis, we developed two profiling tools: (1) We correlate the task execution plan with the resource utilization for both MapReduce and Spark, and visually present this correlation; (2) We provide a break-down of the task execution time for in-depth analysis. Through detailed experiments, we quantify the performance differences between MapReduce and Spark. Furthermore, we attribute these performance differences to different components which are architected differently in the two frameworks. We further expose the source of these performance differences by using a set of micro-benchmark experiments. Overall, our experiments show that Spark is about 2.5x, 5x, and 5x faster than MapReduce, for Word Count, k-means, and PageRank, respectively. 
The main causes of these speedups are the efficiency of the hash-based aggregation component for combine, as well as reduced CPU and disk overheads due to RDD caching in Spark. An exception to this is the Sort workload, for which MapReduce is 2x faster than Spark. We show that MapReduce's execution model is more efficient for shuffling data than Spark, thus making Sort run faster on MapReduce.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2015:TMI, author = "Yu Liu and Jiaheng Lu and Hua Yang and Xiaokui Xiao and Zhewei Wei", title = "Towards maximum independent sets on massive graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2122--2133", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831366", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Maximum independent set (MIS) is a fundamental problem in graph theory and it has important applications in many areas such as social network analysis, graphical information systems and coding theory. The problem is NP-hard, and there has been numerous studies on its approximate solutions. While successful to a certain degree, the existing methods require memory space at least linear in the size of the input graph. This has become a serious concern in view of the massive volume of today's fast-growing graphs. In this paper, we study the MIS problem under the semi-external setting, which assumes that the main memory can accommodate all vertices of the graph but not all edges. We present a greedy algorithm and a general vertex-swap framework, which swaps vertices to incrementally increase the size of independent sets. 
Our solutions require only few sequential scans of graphs on the disk file, thus enabling in-memory computation without costly random disk accesses. Experiments on large-scale datasets show that our solutions are able to compute a large independent set for a massive graph with 59 million vertices and 151 million edges using a commodity machine, with a memory cost of 469MB and a time cost of three minutes, while yielding an approximation ratio that is around 99\% of the theoretical optimum.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Meehan:2015:SSM, author = "John Meehan and Nesime Tatbul and Stan Zdonik and Cansu Aslantas and Ugur Cetintemel and Jiang Du and Tim Kraska and Samuel Madden and David Maier and Andrew Pavlo and Michael Stonebraker and Kristin Tufte and Hao Wang", title = "{S-Store}: streaming meets transaction processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2134--2145", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831367", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processing addresses the needs of real-time applications. Transaction processing addresses the coordination and safety of short atomic computations. Heretofore, these two modes of operation existed in separate, stove-piped systems. In this work, we attempt to fuse the two computational paradigms in a single system called S-Store. In this way, S-Store can simultaneously accommodate OLTP and streaming applications. We present a simple transaction model for streams that integrates seamlessly with a traditional OLTP system, and provides both ACID and stream-oriented guarantees. 
We chose to build S-Store as an extension of H-Store --- an open-source, in-memory, distributed OLTP database system. By implementing S-Store in this way, we can make use of the transaction processing facilities that H-Store already provides, and we can concentrate on the additional features that are needed to support streaming. Similar implementations could be done using other main-memory OLTP platforms. We show that we can actually achieve higher throughput for streaming workloads in S-Store than an equivalent deployment in H-Store alone. We also show how this can be achieved within H-Store with the addition of a modest amount of new functionality. Furthermore, we compare S-Store to two state-of-the-art streaming systems, Esper and Apache Storm, and show how S-Store can sometimes exceed their performance while at the same time providing stronger correctness guarantees.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Levandoski:2015:MVR, author = "Justin Levandoski and David Lomet and Sudipta Sengupta and Ryan Stutsman and Rui Wang", title = "Multi-version range concurrency control in {Deuteronomy}", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2146--2157", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831368", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Deuteronomy transactional key value store executes millions of serializable transactions/second by exploiting multi-version timestamp order concurrency control. However, it has not supported range operations, only individual record operations (e.g., create, read, update, delete). 
In this paper, we enhance our multi-version timestamp order technique to handle range concurrency and prevent phantoms. Importantly, we maintain high performance while respecting the clean separation of duties required by Deuteronomy, where a transaction component performs purely logical concurrency control (including range support), while a data component performs data storage and management duties. Like the rest of the Deuteronomy stack, our range technique manages concurrency information in a latch-free manner. With our range enhancement, Deuteronomy can reach scan speeds of nearly 250 million records/s (more than 27 GB/s) on modern hardware, while providing serializable isolation complete with phantom prevention.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:QEI, author = "Hao Li and Chee-Yong Chan and David Maier", title = "Query from examples: an iterative, data-driven approach to query construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2158--2169", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831369", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose a new approach, called Query from Examples (QFE), to help non-expert database users construct SQL queries. Our approach, which is designed for users who might be unfamiliar with SQL, only requires that the user is able to determine whether a given output table is the result of his or her intended query on a given input database. To kick-start the construction of a target query Q, the user first provides a pair of inputs: a sample database D and an output table R which is the result of Q on D. 
As there will be many candidate queries that transform D to R, QFE winnows this collection by presenting the user with new database-result pairs that distinguish these candidates. Unlike previous approaches that use synthetic data for such pairs, QFE strives to make these distinguishing pairs as close to the original (D, R) pair as possible. By doing so, it seeks to minimize the effort needed by a user to determine if a new database-result pair is consistent with his or her desired query. We demonstrate the effectiveness and efficiency of our approach using real datasets from SQLShare, a cloud-based platform designed to help scientists utilize RDBMS technology for data analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Galhotra:2015:TCR, author = "Sainyam Galhotra and Amitabha Bagchi and Srikanta Bedathur and Maya Ramanath and Vidit Jain", title = "Tracking the conductance of rapidly evolving topic-subgraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2170--2181", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831370", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Monitoring the formation and evolution of communities in large online social networks such as Twitter is an important problem that has generated considerable interest in both industry and academia. Fundamentally, the problem can be cast as studying evolving subgraphs (each subgraph corresponding to a topical community) on an underlying social graph --- with users as nodes and the connection between them as edges. A key metric of interest in this setting is tracking the changes to the conductance of subgraphs induced by edge activations. 
This metric quantifies how well or poorly connected a subgraph is to the rest of the graph relative to its internal connections. Conductance has been demonstrated to be of great use in many applications, such as identifying bursty topics, tracking the spread of rumors, and so on. However, tracking this simple metric presents a considerable scalability challenge --- the underlying social network is large, the number of communities that are active at any moment is large, the rate at which these communities evolve is high, and moreover, we need to track conductance in real-time. We address these challenges in this paper. We propose an in-memory approximation called BloomGraphs to store and update these (possibly overlapping) evolving subgraphs. As the name suggests, we use Bloom filters to represent an approximation of the underlying graph. This representation is compact and computationally efficient to maintain in the presence of updates. This is especially important when we need to simultaneously maintain thousands of evolving subgraphs. BloomGraphs are used in computing and tracking conductance of these subgraphs as edge-activations arrive. BloomGraphs have several desirable properties in the context of this application, including a small memory footprint and efficient updateability. We also demonstrate mathematically that the error incurred in computing conductance is one-sided and that in the case of evolving subgraphs the change in approximate conductance has the same sign as the change in exact conductance in most cases. We validate the effectiveness of BloomGraphs through extensive experimentation on large Twitter graphs and other social networks.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vartak:2015:SED, author = "Manasi Vartak and Sajjadur Rahman and Samuel Madden and Aditya Parameswaran and Neoklis Polyzotis", title = "{SeeDB}: efficient data-driven visualization recommendations to support visual analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2182--2193", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831371", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data analysts often build visualizations as the first step in their analytical workflow. However, when working with high-dimensional datasets, identifying visualizations that show relevant or desired trends in data can be laborious. We propose SeeDB, a visualization recommendation engine to facilitate fast visual analysis: given a subset of data to be studied, SeeDB intelligently explores the space of visualizations, evaluates promising visualizations for trends, and recommends those it deems most ``useful'' or ``interesting''. The two major obstacles in recommending interesting visualizations are (a) scale: evaluating a large number of candidate visualizations while responding within interactive time scales, and (b) utility: identifying an appropriate metric for assessing interestingness of visualizations. For the former, SeeDB introduces pruning optimizations to quickly identify high-utility visualizations and sharing optimizations to maximize sharing of computation across visualizations. For the latter, as a first step, we adopt a deviation-based metric for visualization utility, while indicating how we may be able to generalize it to other factors influencing utility. We implement SeeDB as a middleware layer that can run on top of any DBMS. 
Our experiments show that our framework can identify interesting visualizations with high accuracy. Our optimizations lead to multiple orders of magnitude speedup on relational row and column stores and provide recommendations at interactive time scales. Finally, we demonstrate via a user study the effectiveness of our deviation-based utility metric and the value of recommendations in supporting visual analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiu:2015:DLS, author = "Disheng Qiu and Luciano Barbosa and Xin Luna Dong and Yanyan Shen and Divesh Srivastava", title = "{Dexter}: large-scale discovery and extraction of product specifications on the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "13", pages = "2194--2205", month = sep, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2831360.2831372", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Sep 30 17:17:35 MDT 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The web is a rich resource of structured data. There has been an increasing interest in using web structured data for many applications such as data integration, web search and question answering. In this paper, we present Dexter, a system to find product sites on the web, and detect and extract product specifications from them. Since product specifications exist in multiple product sites, our focused crawler relies on search queries and backlinks to discover product sites. To perform the detection, and handle the high diversity of specifications in terms of content, size and format, our system uses supervised learning to classify HTML fragments (e.g., tables and lists) present in web pages as specifications or not. 
To perform large-scale extraction of the attribute-value pairs from the HTML fragments identified by the specification detector, Dexter adopts two lightweight strategies: a domain-independent and unsupervised wrapper method, which relies on the observation that these HTML fragments have very similar structure; and a combination of this strategy with a previous approach, which infers extraction patterns by annotations generated by automatic but noisy annotators. The results show that our crawler strategy to locate product specification pages is effective: (1) it discovered 1.46M product specification pages from 3,005 sites and 9 different categories; (2) the specification detector obtains high values of F-measure (close to 0.9) over a heterogeneous set of product specifications; and (3) our efficient wrapper methods for attribute-value extraction get very high values of precision (0.92) and recall (0.95) and obtain better results than a state-of-the-art, supervised rule-based wrapper.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2015:QAL, author = "Qiang Huang and Jianlin Feng and Yikai Zhang and Qiong Fang and Wilfred Ng", title = "Query-aware locality-sensitive hashing for approximate nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "1", pages = "1--12", month = sep, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Locality-Sensitive Hashing (LSH) and its variants are the well-known indexing schemes for the $c$-Approximate Nearest Neighbor ($c$-ANN) search problem in high-dimensional Euclidean space. 
Traditionally, LSH functions are constructed in a query-oblivious manner in the sense that buckets are partitioned before any query arrives. However, objects closer to a query may be partitioned into different buckets, which is undesirable. Due to the use of query-oblivious bucket partition, the state-of-the-art LSH schemes for external memory, namely C2LSH and LSB-Forest, only work with approximation ratio of integer $ c \geq 2$. In this paper, we introduce a novel concept of query-aware bucket partition which uses a given query as the ``anchor'' for bucket partition. Accordingly, a query-aware LSH function is a random projection coupled with query-aware bucket partition, which removes random shift required by traditional query-oblivious LSH functions. Notably, query-aware bucket partition can be easily implemented so that query performance is guaranteed. We propose a novel query-aware LSH scheme named QALSH for $c$-ANN search over external memory. Our theoretical studies show that QALSH enjoys a guarantee on query quality. The use of query-aware LSH function enables QALSH to work with any approximation ratio $ c > 1$. Extensive experiments show that QALSH outperforms C2LSH and LSB-Forest, especially in high-dimensional space. Specifically, by using a ratio $ c < 2$, QALSH can achieve much better query quality.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khaouid:2015:KCD, author = "Wissam Khaouid and Marina Barsky and Venkatesh Srinivasan and Alex Thomo", title = "{$K$}-core decomposition of large networks on a single {PC}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "1", pages = "13--23", month = sep, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Studying the topology of a network is critical to inferring underlying dynamics such as tolerance to failure, group behavior and spreading patterns. $k$-core decomposition is a well-established metric which partitions a graph into layers from external to more central vertices. In this paper we aim to explore whether $k$-core decomposition of large networks can be computed using a consumer-grade PC. We feature implementations of the ``vertex-centric'' distributed protocol introduced by Montresor, De Pellegrini and Miorandi on GraphChi and Webgraph. Also, we present an accurate implementation of the Batagelj and Zaversnik algorithm for $k$-core decomposition in Webgraph. With our implementations, we show that we can efficiently handle networks of billions of edges using a single consumer-level machine within reasonable time and can produce excellent approximations in only a fraction of the execution time. To the best of our knowledge, our biggest graphs are considerably larger than the graphs considered in the literature. Next, we present an optimized implementation of an external-memory algorithm (EMcore) by Cheng, Ke, Chu, and {\"O}zsu. We show that this algorithm also performs well for large datasets, however, it cannot predict whether a given memory budget is sufficient for a new dataset. 
We present a thorough analysis of all algorithms concluding that it is viable to compute $k$-core decomposition for large networks in a consumer-grade PC.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2015:WCP, author = "Zhenguo Li and Yixiang Fang and Qin Liu and Jiefeng Cheng and Reynold Cheng and John C. S. Lui", title = "Walking in the cloud: parallel {SimRank} at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "1", pages = "24--35", month = sep, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:24 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite its popularity, SimRank is computationally costly, in both time and space. In particular, its recursive nature poses a great challenge in using modern distributed computing power, and also prevents querying similarities individually. Existing solutions suffer greatly from these practical issues. In this paper, we break such dependency for maximum efficiency possible. Our method consists of offline and online phases. In offline phase, a length-$n$ indexing vector is derived by solving a linear system in parallel. At online query time, the similarities are computed instantly from the index vector. Throughout, the Monte Carlo method is used to maximally reduce time and space. Our algorithm, called CloudWalker, is highly parallelizable, with only linear time and space. Remarkably, it responses to both single-pair and single-source queries in constant time. CloudWalker is orders of magnitude more efficient and scalable than existing solutions for large-scale problems. 
Implemented on Spark with 10 machines and tested on the web-scale clue-web graph with 1 billion nodes and 43 billion edges, it takes 110 hours for offline indexing, 64 seconds for a single-pair query, and 188 seconds for a single-source query. To the best of our knowledge, our work is the first to report results on clue-web, which is 10x larger than the largest graph ever reported for SimRank computation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arocena:2015:MBE, author = "Patricia C. Arocena and Boris Glavic and Giansalvatore Mecca and Ren{\'e}e J. Miller and Paolo Papotti and Donatello Santoro", title = "Messing up with {BART}: error generation for evaluating data-cleaning algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "2", pages = "36--47", month = oct, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of introducing errors into clean databases for the purpose of benchmarking data-cleaning algorithms. Our goal is to provide users with the highest possible level of control over the error-generation process, and at the same time develop solutions that scale to large databases. We show in the paper that the error-generation problem is surprisingly challenging, and in fact, NP-complete. To provide a scalable solution, we develop a correct and efficient greedy algorithm that sacrifices completeness, but succeeds under very reasonable assumptions. To scale to millions of tuples, the algorithm relies on several non-trivial optimizations, including a new symmetry property of data quality constraints. 
The trade-off between control and scalability is the main technical contribution of the paper.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hayashi:2015:FDB, author = "Takanori Hayashi and Takuya Akiba and Yuichi Yoshida", title = "Fully dynamic betweenness centrality maintenance on massive networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "2", pages = "48--59", month = oct, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Measuring the relative importance of each vertex in a network is one of the most fundamental building blocks in network analysis. Among several importance measures, betweenness centrality, in particular, plays key roles in many real applications. Considerable effort has been made for developing algorithms for static settings. However, real networks today are highly dynamic and are evolving rapidly, and scalable dynamic methods that can instantly reflect graph changes into centrality values are required. In this paper, we present the first fully dynamic method for managing betweenness centrality of all vertices in a large dynamic network. Its main data structure is the weighted hyperedge representation of shortest paths called hypergraph sketch. We carefully design dynamic update procedure with theoretical accuracy guarantee. To accelerate updates, we further propose two auxiliary data structures called two-ball index and special-purpose reachability index. Experimental results using real networks demonstrate its high scalability and efficiency. 
In particular, it can reflect a graph change in less than a millisecond on average for a large-scale web graph with 106M vertices and 3.7B edges, which is several orders of magnitude larger than the limits of previous dynamic methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2015:CCC, author = "Wei Lu and Wei Chen and Laks V. S. Lakshmanan", title = "From competition to complementarity: comparative influence diffusion and maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "2", pages = "60--71", month = oct, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Influence maximization is a well-studied problem that asks for a small set of influential users from a social network, such that by targeting them as early adopters, the expected total adoption through influence cascades over the network is maximized. However, almost all prior work focuses on cascades of a single propagating entity or purely-competitive entities. In this work, we propose the Comparative Independent Cascade (Com-IC) model that covers the full spectrum of entity interactions from competition to complementarity. In Com-IC, users' adoption decisions depend not only on edge-level information propagation, but also on a node-level automaton whose behavior is governed by a set of model parameters, enabling our model to capture not only competition, but also complementarity, to any possible degree. We study two natural optimization problems, Self Influence Maximization and Complementary Influence Maximization, in a novel setting with complementary entities. 
Both problems are NP-hard, and we devise efficient and effective approximation algorithms via non-trivial techniques based on reverse-reachable sets and a novel ``sandwich approximation'' strategy. The applicability of both techniques extends beyond our model and problems. Our experiments show that the proposed algorithms consistently outperform intuitive baselines on four real-world social networks, often by a significant margin. In addition, we learn model parameters from real user action logs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kloudas:2015:POD, author = "Konstantinos Kloudas and Margarida Mamede and Nuno Pregui{\c{c}}a and Rodrigo Rodrigues", title = "{Pixida}: optimizing data parallel jobs in wide-area data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "2", pages = "72--83", month = oct, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of global-scale services, big data analytical queries are often required to process datasets that span multiple data centers (DCs). In this setting, cross-DC bandwidth is often the scarcest, most volatile, and/or most expensive resource. However, current widely deployed big data analytics frameworks make no attempt to minimize the traffic traversing these links. In this paper, we present Pixida, a scheduler that aims to minimize data movement across resource constrained links. To achieve this, we introduce a new abstraction called Silo, which is key to modeling Pixida's scheduling goals as a graph partitioning problem. 
Furthermore, we show that existing graph partitioning problem formulations do not map to how big data jobs work, causing their solutions to miss opportunities for avoiding data movement. To address this, we formulate a new graph partitioning problem and propose a novel algorithm to solve it. We integrated Pixida in Spark and our experiments show that, when compared to existing schedulers, Pixida achieves a significant traffic reduction of up to $ \approx 9 \times $ on the aforementioned links.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2015:SOS, author = "Lu Wang and Robert Christensen and Feifei Li and Ke Yi", title = "Spatial online sampling and aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "84--95", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The massive adoption of smart phones and other mobile devices has generated humongous amount of spatial and spatio-temporal data. The importance of spatial analytics and aggregation is ever-increasing. An important challenge is to support interactive exploration over such data. However, spatial analytics and aggregation using all data points that satisfy a query condition is expensive, especially over large data sets, and could not meet the needs of interactive exploration. To that end, we present novel indexing structures that support spatial online sampling and aggregation on large spatial and spatio-temporal data sets. In spatial online sampling, random samples from the set of spatial (or spatio-temporal) points that satisfy a query condition are generated incrementally in an online fashion. 
With more and more samples, various spatial analytics and aggregations can be performed in an online, interactive fashion, with estimators that have better accuracy over time. Our design works well for both memory-based and disk-resident data sets, and scales well towards different query and sample sizes. More importantly, our structures are dynamic, hence, they are able to deal with insertions and deletions efficiently. Extensive experiments on large real data sets demonstrate the improvements achieved by our indexing structures compared to other baseline methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Richter:2015:SDA, author = "Stefan Richter and Victor Alvarez and Jens Dittrich", title = "A seven-dimensional analysis of hashing methods and its implications on query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "96--107", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hashing is a solved problem. It allows us to get constant time access for lookups. Hashing is also simple. It is safe to use an arbitrary method as a black box and expect good performance, and optimizations to hashing can only improve it by a negligible delta. Why are all of the previous statements plain wrong? That is what this paper is about. In this paper we thoroughly study hashing for integer keys and carefully analyze the most common hashing methods in a five-dimensional requirements space: (1) data-distribution, (2) load factor, (3) dataset size, (4) read/write-ratio, and (5) un/successful-ratio. 
Each point in that design space may potentially suggest a different hashing scheme, and additionally also a different hash function. We show that a right or wrong decision in picking the right hashing scheme and hash function combination may lead to significant difference in performance. To substantiate this claim, we carefully analyze two additional dimensions: (6) five representative hashing schemes (which includes an improved variant of Robin Hood hashing), (7) four important classes of hash functions widely used today. That is, we consider 20 different combinations in total. Finally, we also provide a glimpse about the effect of table memory layout and the use of SIMD instructions. Our study clearly indicates that picking the right combination may have considerable impact on insert and lookup performance, as well as memory footprint. A major conclusion of our work is that hashing should be considered a white box before blindly using it in applications, such as query processing. Finally, we also provide a strong guideline about when to use which hashing method.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arocena:2015:IIM, author = "Patricia C. Arocena and Boris Glavic and Radu Ciucanu and Ren{\'e}e J. Miller", title = "The {iBench} integration metadata generator", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "108--119", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given the maturity of the data integration field it is surprising that rigorous empirical evaluations of research ideas are so scarce. 
We identify a major roadblock for empirical work --- the lack of comprehensive metadata generators that can be used to create benchmarks for different integration tasks. This makes it difficult to compare integration solutions, understand their generality, and understand their performance. We present iBench, the first metadata generator that can be used to evaluate a wide-range of integration tasks (data exchange, mapping creation, mapping composition, schema evolution, among many others). iBench permits control over the size and characteristics of the metadata it generates (schemas, constraints, and mappings). Our evaluation demonstrates that iBench can efficiently generate very large, complex, yet realistic scenarios with different characteristics. We also present an evaluation of three mapping creation systems using iBench and show that the intricate control that iBench provides over metadata scenarios can reveal new and important empirical insights. iBench is an open-source, extensible tool that we are providing to the community. We believe it will raise the bar for empirical evaluation and comparison of data integration systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Altwaijry:2015:QFI, author = "Hotham Altwaijry and Sharad Mehrotra and Dmitri V. Kalashnikov", title = "{QuERy}: a framework for integrating entity resolution with query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "120--131", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper explores an analysis-aware data cleaning architecture for a large class of SPJ SQL queries. 
In particular, we propose QuERy, a novel framework for integrating entity resolution (ER) with query processing. The aim of QuERy is to correctly and efficiently answer complex queries issued on top of dirty data. The comprehensive empirical evaluation of the proposed solution demonstrates its significant advantage in terms of efficiency over the traditional techniques for the given problem settings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2015:POM, author = "Taesung Lee and Jin-woo Park and Sanghoon Lee and Seung-Won Hwang and Sameh Elnikety and Yuxiong He", title = "Processing and optimizing main memory spatial-keyword queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "132--143", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Important cloud services rely on spatial-keyword queries, containing a spatial predicate and arbitrary boolean keyword queries. In particular, we study the processing of such queries in main memory to support short response times. In contrast, current state-of-the-art spatial-keyword indexes and relational engines are designed for different assumptions. Rather than building a new spatial-keyword index, we employ a cost-based optimizer to process these queries using a spatial index and a keyword index. We address several technical challenges to achieve this goal. We introduce three operators as the building blocks to construct plans for main memory query processing. We then develop a cost model for the operators and query plans. We introduce five optimization techniques that efficiently reduce the search space and produce a query plan with low cost. 
The optimization techniques are computationally efficient, and they identify a query plan with a formal approximation guarantee under the common independence assumption. Furthermore, we extend the framework to exploit interesting orders. We implement the query optimizer to empirically validate our proposed approach using real-life datasets. The evaluation shows that the optimizations provide significant reduction in the average and tail latency of query processing: 7- to 11-fold reduction over using a single index in terms of 99th percentile response time. In addition, this approach outperforms existing spatial-keyword indexes, and DBMS query optimizers for both average and high-percentile response times.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2015:NSH, author = "Yongjoo Park and Michael Cafarella and Barzan Mozafari", title = "Neighbor-sensitive hashing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "144--155", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate $k$NN ($k$-nearest neighbor) techniques using binary hash functions are among the most commonly used approaches for overcoming the prohibitive cost of performing exact $k$NN queries. However, the success of these techniques largely depends on their hash functions' ability to distinguish $k$NN items; that is, the $k$NN items retrieved based on data items' hashcodes, should include as many true $k$NN items as possible.
A widely-adopted principle for this process is to ensure that similar items are assigned to the same hashcode so that the items with the hashcodes similar to a query's hashcode are likely to be true neighbors. In this work, we abandon this heavily-utilized principle and pursue the opposite direction for generating more effective hash functions for $k$NN tasks. That is, we aim to increase the distance between similar items in the hashcode space, instead of reducing it. Our contribution begins by providing theoretical analysis on why this revolutionary and seemingly counter-intuitive approach leads to a more accurate identification of $k$NN items. Our analysis is followed by a proposal for a hashing algorithm that embeds this novel principle. Our empirical studies confirm that a hashing algorithm based on this counter-intuitive idea significantly improves the efficiency and accuracy of state-of-the-art techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2015:CMB, author = "Botong Huang and Nicholas W. D. Jarrett and Shivnath Babu and Sayan Mukherjee and Jun Yang", title = "{C{\"u}m{\"u}l{\"o}n}: matrix-based data analytics in the cloud with spot instances", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "156--167", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We describe C{\"u}m{\"u}l{\"o}n, a system aimed at helping users develop and deploy matrix-based data analysis programs in a public cloud. A key feature of C{\"u}m{\"u}l{\"o}n is its end-to-end support for the so-called spot instances ---machines whose market price fluctuates over time but is usually much lower than the regular fixed price.
A user sets a bid price when acquiring spot instances, and loses them as soon as the market price exceeds the bid price. While spot instances can potentially save cost, they are difficult to use effectively, and run the risk of not finishing work while costing more. C{\"u}m{\"u}l{\"o}n provides a highly elastic computation and storage engine on top of spot instances, and offers automatic cost-based optimization of execution, deployment, and bidding strategies. C{\"u}m{\"u}l{\"o}n further quantifies how the uncertainty in the market price translates into the cost uncertainty of its recommendations, and allows users to specify their risk tolerance as an optimization constraint.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kaul:2015:NLU, author = "Manohar Kaul and Raymond Chi-Wing Wong and Christian S. Jensen", title = "New lower and upper bounds for shortest distance queries on terrains", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "168--179", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The increasing availability of massive and accurate laser data enables the processing of spatial queries on terrains. As shortest-path computation, an integral element of query processing, is inherently expensive on terrains, a key approach to enabling efficient query processing is to reduce the need for exact shortest-path computation in query processing. We develop new lower and upper bounds on terrain shortest distances that are provably tighter than any existing bounds. Unlike existing bounds, the new bounds do not rely on the quality of the triangulation. 
We show how use of the new bounds speeds up query processing by reducing the need for exact distance computations. Speedups of nearly an order of magnitude are demonstrated empirically for well-known spatial queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Freire:2015:CRR, author = "Cibele Freire and Wolfgang Gatterbauer and Neil Immerman and Alexandra Meliou", title = "The complexity of resilience and responsibility for self-join-free conjunctive queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "180--191", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Several research thrusts in the area of data management have focused on understanding how changes in the data affect the output of a view or standing query. Example applications are explaining query results, propagating updates through views, and anonymizing datasets. An important aspect of this analysis is the problem of deleting a minimum number of tuples from the input tables to make a given Boolean query false, which we refer to as ``the resilience of a query.'' In this paper, we study the complexity of resilience for self-join-free conjunctive queries with arbitrary functional dependencies. The cornerstone of our work is the novel concept of triads, a simple structural property of a query that leads to the several dichotomy results we show in this paper. The concepts of triads and resilience bridge the connections between the problems of deletion propagation and causal responsibility, and allow us to substantially advance the known complexity results in these topics.
Specifically, we show a dichotomy for the complexity of resilience, which identifies previously unknown tractable families for deletion propagation with source side-effects, and we extend this result to account for functional dependencies. Further, we identify a mistake in a previous dichotomy for causal responsibility, and offer a revised characterization based purely on the structural form of the query (presence or absence of triads). Finally, we extend the dichotomy for causal responsibility in two ways: (a) we account for functional dependencies in the input tables, and (b) we compute responsibility for sets of tuples specified via wildcards.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2015:SAD, author = "Hao Huang and Shiva Prasad Kasiviswanathan", title = "Streaming anomaly detection using randomized matrix sketching", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "192--203", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data is continuously being generated from sources such as machines, network traffic, application logs, etc. Timely and accurate detection of anomalies in massive data streams has important applications such as in preventing machine failures, intrusion detection, and dynamic load balancing. In this paper, we introduce a novel (unsupervised) anomaly detection framework which can be used to detect anomalies in a streaming fashion by making only one pass over the data while utilizing limited storage. We adapt ideas from matrix sketching to maintain, in a streaming model, a set of few orthogonal vectors that form a good approximate basis for all the observed data. 
Using this constructed orthogonal basis, anomalies in new incoming data are detected based on a simple reconstruction error test. We theoretically prove that our algorithm compares favorably with an offline approach based on expensive global singular value decomposition (SVD) updates. Additionally, we apply ideas from randomized low-rank matrix approximations to further speedup the algorithm. The experimental results show the effectiveness and efficiency of our approach over other popular scalable anomaly detection approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Leis:2015:HGQ, author = "Viktor Leis and Andrey Gubichev and Atanas Mirchev and Peter Boncz and Alfons Kemper and Thomas Neumann", title = "How good are query optimizers, really?", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "204--215", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding a good join order is crucial for query performance. In this paper, we introduce the Join Order Benchmark (JOB) and experimentally revisit the main components in the classic query optimizer architecture using a complex, real-world data set and realistic multi-join queries. We investigate the quality of industrial-strength cardinality estimators and find that all estimators routinely produce large errors. We further show that while estimates are essential for finding a good join order, query performance is unsatisfactory if the query engine relies too heavily on these estimates. Using another set of experiments that measure the impact of the cost model, we find that it has much less influence on query performance than the cardinality estimates. 
Finally, we investigate plan enumeration techniques comparing exhaustive dynamic programming with heuristic algorithms and find that exhaustive enumeration improves performance despite the sub-optimal cardinality estimates.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Interlandi:2015:TDP, author = "Matteo Interlandi and Kshitij Shah and Sai Deep Tetali and Muhammad Ali Gulzar and Seunghyun Yoo and Miryung Kim and Todd Millstein and Tyson Condie", title = "{Titian}: data provenance support in {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "3", pages = "216--227", month = nov, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 2 14:26:50 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Debugging data processing logic in Data-Intensive Scalable Computing (DISC) systems is a difficult and time consuming effort. Today's DISC systems offer very little tooling for debugging programs, and as a result programmers spend countless hours collecting evidence (e.g., from log files) and performing trial and error debugging. To aid this effort, we built Titian, a library that enables data provenance---tracking data through transformations---in Apache Spark. Data scientists using the Titian Spark extension will be able to quickly identify the input data at the root cause of a potential bug or outlier result. Titian is built directly into the Spark platform and offers data provenance support at interactive speeds---orders-of-magnitude faster than alternative solutions---while minimally impacting Spark job performance; observed overheads for capturing data lineage rarely exceed 30\% above the baseline job execution time.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rodiger:2015:HSQ, author = "Wolf R{\"o}diger and Tobias M{\"u}hlbauer and Alfons Kemper and Thomas Neumann", title = "High-speed query processing over high-speed networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "228--239", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern database clusters entail two levels of networks: connecting CPUs and NUMA regions inside a single server in the small and multiple servers in the large. The huge performance gap between these two types of networks used to slow down distributed query processing to such an extent that a cluster of machines actually performed worse than a single many-core server. The increased main-memory capacity of the cluster remained the sole benefit of such a scale-out. The economic viability of high-speed interconnects such as InfiniBand has narrowed this performance gap considerably. However, InfiniBand's higher network bandwidth alone does not improve query performance as expected when the distributed query engine is left unchanged. The scalability of distributed query processing is impaired by TCP overheads, switch contention due to uncoordinated communication, and load imbalances resulting from the inflexibility of the classic exchange operator model. This paper presents the blueprint for a distributed query engine that addresses these problems by considering both levels of networks holistically. It consists of two parts: First, hybrid parallelism that distinguishes local and distributed parallelism for better scalability in both the number of cores as well as servers. 
Second, a novel communication multiplexer tailored for analytical database workloads using remote direct memory access (RDMA) and low-latency network scheduling for high-speed communication with almost no CPU overhead. An extensive evaluation within the HyPer database system using the TPC-H benchmark shows that our holistic approach indeed enables high-speed query processing over high-speed networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zong:2015:BQD, author = "Bo Zong and Xusheng Xiao and Zhichun Li and Zhenyu Wu and Zhiyun Qian and Xifeng Yan and Ambuj K. Singh and Guofei Jiang", title = "Behavior query discovery in system-generated temporal graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "240--251", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computer system monitoring generates huge amounts of logs that record the interaction of system entities. How to query such data to better understand system behaviors and identify potential system risks and malicious behaviors becomes a challenging task for system administrators due to the dynamics and heterogeneity of the data. System monitoring data are essentially heterogeneous temporal graphs with nodes being system entities and edges being their interactions over time. Given the complexity of such graphs, it becomes time-consuming for system administrators to manually formulate useful queries in order to examine abnormal activities, attacks, and vulnerabilities in computer systems. In this work, we investigate how to query temporal graphs and treat query formulation as a discriminative temporal graph pattern mining problem. 
We introduce TGMiner to mine discriminative patterns from system logs, and these patterns can be taken as templates for building more complex queries. TGMiner leverages temporal information in graphs to prune graph patterns that share similar growth trend without compromising pattern quality. Experimental results on real system data show that TGMiner is 6-32 times faster than baseline methods. The discovered patterns were verified by system experts; they achieved high precision (97\%) and recall (91\%).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kocberber:2015:AMA, author = "Onur Kocberber and Babak Falsafi and Boris Grot", title = "Asynchronous memory access chaining", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "252--263", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory databases rely on pointer-intensive data structures to quickly locate data in memory. A single lookup operation in such data structures often exhibits long-latency memory stalls due to dependent pointer dereferences. Hiding the memory latency by launching additional memory accesses for other lookups is an effective way of improving performance of pointer-chasing codes (e.g., hash table probes, tree traversals). The ability to exploit such inter-lookup parallelism is beyond the reach of modern out-of-order cores due to the limited size of their instruction window. Instead, recent work has proposed software prefetching techniques that exploit inter-lookup parallelism by arranging a set of independent lookups into a group or a pipeline, and navigate their respective pointer chains in a synchronized fashion. 
While these techniques work well for highly regular access patterns, they break down in the face of irregularity across lookups. Such irregularity includes variable-length pointer chains, early exit, and read/write dependencies. This work introduces Asynchronous Memory Access Chaining (AMAC), a new approach for exploiting inter-lookup parallelism to hide the memory access latency. AMAC achieves high dynamism in dealing with irregularity across lookups by maintaining the state of each lookup separately from that of other lookups. This feature enables AMAC to initiate a new lookup as soon as any of the in-flight lookups complete. In contrast, the static arrangement of lookups into a group or pipeline in existing techniques precludes such adaptivity. Our results show that AMAC matches or outperforms state-of-the-art prefetching techniques on regular access patterns, while delivering up to 2.3x higher performance under irregular data structure lookups. AMAC fully utilizes the available microarchitectural resources, generating the maximum number of memory accesses allowed by hardware in both single- and multi-threaded execution modes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haney:2015:DPA, author = "Samuel Haney and Ashwin Machanavajjhala and Bolin Ding", title = "Design of policy-aware differentially private algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "264--275", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of designing error optimal differentially private algorithms is well studied. 
Recent work applying differential privacy to real world settings have used variants of differential privacy that appropriately modify the notion of neighboring databases. The problem of designing error optimal algorithms for such variants of differential privacy is open. In this paper, we show a novel transformational equivalence result that can turn the problem of query answering under differential privacy with a modified notion of neighbors to one of query answering under standard differential privacy, for a large class of neighbor definitions. We utilize the Blowfish privacy framework that generalizes differential privacy. Blowfish uses a policy graph to instantiate different notions of neighboring databases. We show that the error incurred when answering a workload W on a database x under a Blowfish policy graph G is identical to the error required to answer a transformed workload $ f_G (W) $ on database $ g_G (x) $ under standard differential privacy, where $ f_G $ and $ g_G $ are linear transformations based on G. Using this result, we develop error efficient algorithms for releasing histograms and multidimensional range queries under different Blowfish policies. We believe the tools we develop will be useful for finding mechanisms to answer many other classes of queries with low error under other policy graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2015:ACC, author = "Xin Huang and Laks V. S.
Lakshmanan and Jeffrey Xu Yu and Hong Cheng", title = "Approximate closest community search in networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "276--287", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, there has been significant interest in the study of the community search problem in social and information networks: given one or more query nodes, find densely connected communities containing the query nodes. However, most existing studies do not address the ``free rider'' issue, that is, nodes far away from query nodes and irrelevant to them are included in the detected community. Some state-of-the-art models have attempted to address this issue, but not only are their formulated problems NP-hard, they do not admit any approximations without restrictive assumptions, which may not always hold in practice. In this paper, given an undirected graph G and a set of query nodes Q, we study community search using the $k$-truss based community model. We formulate our problem of finding a closest truss community (CTC), as finding a connected $k$-truss subgraph with the largest $k$ that contains Q, and has the minimum diameter among such subgraphs. We prove this problem is NP-hard. Furthermore, it is NP-hard to approximate the problem within a factor $ (2 - \epsilon) $, for any $ \epsilon > 0 $. However, we develop a greedy algorithmic framework, which first finds a CTC containing Q, and then iteratively removes the furthest nodes from Q, from the graph. The method achieves 2-approximation to the optimal solution. To further improve the efficiency, we make use of a compact truss index and develop efficient algorithms for $k$-truss identification and maintenance as nodes get eliminated.
In addition, using bulk deletion optimization and local exploration strategies, we propose two more efficient algorithms. One of them trades some approximation quality for efficiency while the other is a very efficient heuristic. Extensive experiments on 6 real-world networks show the effectiveness and efficiency of our community model and search algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Andre:2015:CLE, author = "Fabien Andr{\'e} and Anne-Marie Kermarrec and Nicolas {Le Scouarnec}", title = "Cache locality is not enough: high-performance nearest neighbor search with product quantization fast scan", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "288--299", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearest Neighbor (NN) search in high dimension is an important feature in many applications (e.g., image retrieval, multimedia databases). Product Quantization (PQ) is a widely used solution which offers high performance, i.e., low response time while preserving a high accuracy. PQ represents high-dimensional vectors (e.g., image descriptors) by compact codes. Hence, very large databases can be stored in memory, allowing NN queries without resorting to slow I/O operations. PQ computes distances to neighbors using cache-resident lookup tables, thus its performance remains limited by (i) the many cache accesses that the algorithm requires, and (ii) its inability to leverage SIMD instructions available on modern CPUs. In this paper, we advocate that cache locality is not sufficient for efficiency. 
To address these limitations, we design a novel algorithm, PQ Fast Scan, that transforms the cache-resident lookup tables into small tables, sized to fit SIMD registers. This transformation allows (i) in-register lookups in place of cache accesses and (ii) an efficient SIMD implementation. PQ Fast Scan has the exact same accuracy as PQ, while having 4 to 6 times lower response time (e.g., for 25 million vectors, scan time is reduced from 74ms to 13ms).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Prokoshyna:2015:CQL, author = "Nataliya Prokoshyna and Jaros{\l}aw Szlichta and Fei Chiang and Ren{\'e}e J. Miller and Divesh Srivastava", title = "Combining quantitative and logical data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "300--311", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Quantitative data cleaning relies on the use of statistical methods to identify and repair data quality problems while logical data cleaning tackles the same problems using various forms of logical reasoning over declarative dependencies. Each of these approaches has its strengths: the logical approach is able to capture subtle data quality problems using sophisticated dependencies, while the quantitative approach excels at ensuring that the repaired data has desired statistical properties. We propose a novel framework within which these two approaches can be used synergistically to combine their respective strengths. 
We instantiate our framework using (i) metric functional dependencies, a type of dependency that generalizes functional dependencies (FDs) to identify inconsistencies in domains where only large differences in metric data are considered to be a data quality problem, and (ii) repairs that modify the inconsistent data so as to minimize statistical distortion, measured using the Earth Mover's Distance. We show that the problem of computing a statistical distortion minimal repair is NP-hard. Given this complexity, we present an efficient algorithm for finding a minimal repair that has a small statistical distortion using EMD computation over semantically related attributes. To identify semantically related attributes, we present a sound and complete axiomatization and an efficient algorithm for testing implication of metric FDs. While the complexity of inference for some other FD extensions is co-NP complete, we show that the inference problem for metric FDs remains linear, as in traditional FDs. We prove that every instance that can be generated by our repair algorithm is set-minimal (with no unnecessary changes). Our experimental evaluation demonstrates that our techniques obtain a considerably lower statistical distortion than existing repair techniques, while achieving similar levels of efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papadakis:2015:SAV, author = "George Papadakis and George Alexiou and George Papastefanatos and Georgia Koutrika", title = "Schema-agnostic vs schema-based configurations for blocking methods on homogeneous data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "312--323", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity Resolution constitutes a core task for data integration that, due to its quadratic complexity, typically scales to large datasets through blocking methods. These can be configured in two ways. The schema-based configuration relies on schema information in order to select signatures of high distinctiveness and low noise, while the schema-agnostic one treats every token from all attribute values as a signature. The latter approach has significant potential, as it requires no fine-tuning by human experts and it applies to heterogeneous data. Yet, there is no systematic study on its relative performance with respect to the schema-based configuration. This work covers this gap by comparing analytically the two configurations in terms of effectiveness, time efficiency and scalability. We apply them to 9 established blocking methods and to 11 benchmarks of structured data. We provide valuable insights into the internal functionality of the blocking methods with the help of a novel taxonomy. Our studies reveal that the schema-agnostic configuration offers unsupervised and robust definition of blocking keys under versatile settings, trading a higher computational cost for a consistently higher recall than the schema-based one. 
It also enables the use of state-of-the-art blocking methods without schema knowledge.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Epasto:2015:ENC, author = "Alessandro Epasto and Silvio Lattanzi and Vahab Mirrokni and Ismail Oner Sebe and Ahmed Taei and Sunita Verma", title = "Ego-net community mining applied to friend suggestion", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "324--335", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present a study of the community structure of ego-networks---the graphs representing the connections among the neighbors of a node---for several online social networks. Toward this goal, we design a new technique to efficiently build and cluster all the ego-nets of a graph in parallel (note that even just building the ego-nets efficiently is challenging on large networks). Our experimental findings are quite compelling: at a microscopic level it is easy to detect high quality communities. Leveraging on this fact we, then, develop new features for friend suggestion based on co-occurrences of two nodes in different ego-nets' communities. Our new features can be computed efficiently on very large scale graphs by just analyzing the neighborhood of each node. Furthermore, we prove formally on a stylized model, and by experimental analysis that this new similarity measure outperforms the classic local features employed for friend suggestions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abedjan:2015:TRD, author = "Ziawasch Abedjan and Cuneyt G. 
Akcora and Mourad Ouzzani and Paolo Papotti and Michael Stonebraker", title = "Temporal rules discovery for web data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "336--347", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Declarative rules, such as functional dependencies, are widely used for cleaning data. Several systems take them as input for detecting errors and computing a ``clean'' version of the data. To support domain experts, in specifying these rules, several tools have been proposed to profile the data and mine rules. However, existing discovery techniques have traditionally ignored the time dimension. Recurrent events, such as persons reported in locations, have a duration in which they are valid, and this duration should be part of the rules or the cleaning process would simply fail. In this work, we study the rule discovery problem for temporal web data. Such a discovery process is challenging because of the nature of web data; extracted facts are (i) sparse over time, (ii) reported with delays, and (iii) often reported with errors over the values because of inaccurate sources or non robust extractors. We handle these challenges with a new discovery approach that is more robust to noise. Our solution uses machine learning methods, such as association measures and outlier detection, for the discovery of the rules, together with an aggressive repair of the data in the mining step itself. 
Our experimental evaluation over real-world data from Recorded Future, an intelligence company that monitors over 700K Web sources, shows that temporal rules improve the quality of the data with an increase of the average precision in the cleaning process from 0.37 to 0.84, and a 40\% relative increase in the average F-measure.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Roy:2015:EQA, author = "Sudeepa Roy and Laurel Orr and Dan Suciu", title = "Explaining query answers with explanation-ready databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "348--359", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increased generation and availability of big data in different domains, there is an imminent requirement for data analysis tools that are able to ``explain'' the trends and anomalies obtained from this data to a range of users with different backgrounds. Wu-Madden (PVLDB 2013) and Roy-Suciu (SIGMOD 2014) recently proposed solutions that can explain interesting or unexpected answers to simple aggregate queries in terms of predicates on attributes. In this paper, we propose a generic framework that can support much richer, insightful explanations by preparing the database offline, so that top explanations can be found interactively at query time. The main idea in such explanation-ready databases is to pre-compute the effects of potential explanations (called interventions), and efficiently re-evaluate the original query taking into account these effects.
We formalize this notion and define an explanation-query that can evaluate all possible explanations simultaneously without having to run an iterative process, develop algorithms and optimizations, and evaluate our approach with experiments on real data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deng:2015:EPB, author = "Dong Deng and Guoliang Li and He Wen and Jianhua Feng", title = "An efficient partition based method for exact set similarity joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "360--371", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the exact set similarity join problem, which, given two collections of sets, finds out all the similar set pairs from the collections. Existing methods generally utilize the prefix filter based framework. They generate a prefix for each set and prune all the pairs whose prefixes are disjoint. However the pruning power is limited, because if two dissimilar sets share a common element in their prefixes, they cannot be pruned. To address this problem, we propose a partition-based framework. We design a partition scheme to partition the sets into several subsets and guarantee that two sets are similar only if they share a common subset. To improve the pruning power, we propose a mixture of the subsets and their 1-deletion neighborhoods (the subset of a set by eliminating one element). As there are multiple allocation strategies to generate the mixture, we evaluate different allocations and design a dynamic-programming algorithm to select the optimal one. However the time complexity of generating the optimal one is $ O(s^3) $ for a set with size $s$. 
To speed up the allocation selection, we develop a greedy algorithm with an approximation ratio of 2. To further reduce the complexity, we design an adaptive grouping mechanism, and the two techniques can reduce the complexity to $ O(s \log s)$. Experimental results on three real-world datasets show our method achieves high performance and outperforms state-of-the-art methods by 2--5 times.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haas:2015:CSC, author = "Daniel Haas and Jiannan Wang and Eugene Wu and Michael J. Franklin", title = "{CLAMShell}: speeding up crowds for low-latency data labeling", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "372--383", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data labeling is a necessary but often slow process that impedes the development of interactive systems for modern data analysis. Despite rising demand for manual data labeling, there is a surprising lack of work addressing its high and unpredictable latency. In this paper, we introduce CLAMShell, a system that speeds up crowds in order to achieve consistently low-latency data labeling. We offer a taxonomy of the sources of labeling latency and study several large crowd-sourced labeling deployments to understand their empirical latency profiles. Driven by these insights, we comprehensively tackle each source of latency, both by developing novel techniques such as straggler mitigation and pool maintenance and by optimizing existing methods such as crowd retainer pools and active learning.
We evaluate CLAMShell in simulation and on live workers on Amazon's Mechanical Turk, demonstrating that our techniques can provide an order of magnitude speedup and variance reduction over existing crowdsourced labeling strategies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Firmani:2016:OER, author = "Donatella Firmani and Barna Saha and Divesh Srivastava", title = "Online entity resolution using an oracle", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "384--395", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity resolution (ER) is the task of identifying all records in a database that refer to the same underlying entity. This is an expensive task, and can take a significant amount of money and time; the end-user may want to take decisions during the process, rather than waiting for the task to be completed. We formalize an online version of the entity resolution task, and use an oracle which correctly labels matching and non-matching pairs through queries. In this setting, we design algorithms that seek to maximize progressive recall, and develop a novel analysis framework for prior proposals on entity resolution with an oracle, beyond their worst case guarantees. Finally, we provide both theoretical and experimental analysis of the proposed algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Calautti:2016:EEG, author = "Marco Calautti and Sergio Greco and Cristian Molinaro and Irina Trubitsyna", title = "Exploiting equality generating dependencies in checking chase termination", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "396--407", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The chase is a well-known algorithm with a wide range of applications in data exchange, data cleaning, data integration, query optimization, and ontological reasoning. Since the chase evaluation might not terminate and it is undecidable whether it terminates, the problem of defining (decidable) sufficient conditions ensuring termination has received a great deal of interest in recent years. In this regard, several termination criteria have been proposed. One of the main weaknesses of current approaches is the limited analysis they perform on equality generating dependencies (EGDs). In this paper, we propose sufficient conditions ensuring that a set of dependencies has at least one terminating chase sequence. We propose novel criteria which are able to perform a more accurate analysis of EGDs. Specifically, we propose a new stratification criterion and an adornment algorithm. The latter can both be used as a termination criterion and be combined with current techniques to make them more effective, in that strictly more sets of dependencies are identified. Our techniques identify sets of dependencies that are not recognized by any of the current criteria.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2016:SBF, author = "Tong Yang and Alex X. Liu and Muhammad Shahzad and Yuankun Zhong and Qiaobin Fu and Zi Li and Gaogang Xie and Xiaoming Li", title = "A shifting {Bloom} filter framework for set queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "408--419", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Set queries are fundamental operations in computer systems and applications. This paper addresses the fundamental problem of designing a probabilistic data structure that can quickly process set queries using a small amount of memory. We propose a Shifting Bloom Filter (ShBF) framework for representing and querying sets. We demonstrate the effectiveness of ShBF using three types of popular set queries: membership, association, and multiplicity queries. The key novelty of ShBF is on encoding the auxiliary information of a set element in a location offset. In contrast, prior BF based set data structures allocate additional memory to store auxiliary information. We conducted experiments using real-world network traces, and results show that ShBF significantly advances the state-of-the-art on all three types of set queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2016:HTM, author = "Fan Yang and Jinfeng Li and James Cheng", title = "{Husky}: towards a more efficient and expressive distributed computing framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "420--431", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding efficient, expressive and yet intuitive programming models for data-parallel computing system is an important and open problem. Systems like Hadoop and Spark have been widely adopted for massive data processing, as coarse-grained primitives like map and reduce are succinct and easy to master. However, sometimes over-simplified API hinders programmers from more fine-grained control and designing more efficient algorithms. Developers may have to resort to sophisticated domain-specific languages (DSLs), or even low-level layers like MPI, but this raises development cost---learning many mutually exclusive systems prolongs the development schedule, and the use of low-level tools may result in bug-prone programming. This motivated us to start the Husky open-source project, which is an attempt to strike a better balance between high performance and low development cost. Husky is developed mainly for in-memory large scale data mining, and also serves as a general research platform for designing efficient distributed algorithms. We show that many existing frameworks can be easily implemented and bridged together inside Husky, and Husky is able to achieve similar or even better performance compared with domain-specific systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:RDT, author = "Zeyu Li and Hongzhi Wang and Wei Shao and Jianzhong Li and Hong Gao", title = "Repairing data through regular expressions", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "432--443", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Since regular expressions are often used to detect errors in sequences such as strings or date, it is natural to use them for data repair. Motivated by this, we propose a data repair method based on regular expression to make the input sequence data obey the given regular expression with minimal revision cost. The proposed method contains two steps, sequence repair and token value repair. For sequence repair, we propose the Regular-expression-based Structural Repair (RSR in short) algorithm. RSR algorithm is a dynamic programming algorithm that utilizes Nondeterministic Finite Automata (NFA) to calculate the edit distance between a prefix of the input string and a partial pattern regular expression with time complexity of $ O (n m^2) $ and space complexity of $ O(m n) $ where $m$ is the edge number of NFA and $n$ is the input string length. We also develop an optimization strategy to achieve higher performance for long strings. For token value repair, we combine the edit-distance-based method and associate rules by a unified argument for the selection of the proper method. Experimental results on both real and synthetic data show that the proposed method could repair the data effectively and efficiently.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2016:LLC, author = "Cong Yan and Alvin Cheung", title = "Leveraging lock contention to improve {OLTP} application performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "5", pages = "444--455", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Jan 11 17:54:24 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Locking is one of the predominant costs in transaction processing. While much work has focused on designing efficient concurrency control mechanisms, not much has been done on understanding how transaction applications issue queries and leveraging application semantics to improve application performance. This paper presents Quro, a query-aware compiler that automatically reorders queries in transaction code to improve performance. Observing that certain queries within a transaction are more contentious than others as they require locking the same tuples as other concurrently executing transactions, Quro automatically changes the application such that contentious queries are issued as late as possible. We have evaluated Quro on various transaction benchmarks, and our results show that Quro-generated implementations can increase transaction throughput by up to 6.53x, while reduce transaction latency by up to 85\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Choudhury:2016:MBR, author = "Farhana M. Choudhury and J.
Shane Culpepper and Timos Sellis and Xin Cao", title = "Maximizing bichromatic reverse spatial and textual $k$ nearest neighbor queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "6", pages = "456--467", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:09:59 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of maximizing bichromatic reverse $k$ nearest neighbor queries (BR$k$NN) has been extensively studied in spatial databases. In this work, we present a related query for spatial-textual databases that finds an optimal location, and a set of keywords that maximizes the size of bichromatic reverse spatial textual $k$ nearest neighbors (MaxBRST$k$NN). Such a query has many practical applications including social media advertisements where a limited number of relevant advertisements are displayed to each user. The problem is to find the location and the text contents to include in an advertisement so that it will be displayed to the maximum number of users. The increasing availability of spatial-textual collections allows us to answer these queries for both spatial proximity and textual similarity. This paper is the first to consider the MaxBRST$k$NN query. We show that the problem is NP-hard and present both approximate and exact solutions.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Subercaze:2016:IFM, author = "Julien Subercaze and Christophe Gravier and Jules Chevalier and Frederique Laforest", title = "{Inferray}: fast in-memory {RDF} inference", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "6", pages = "468--479", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:09:59 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The advent of semantic data on the Web requires efficient reasoning systems to infer RDF and OWL data. The linked nature and the huge volume of data entail efficiency and scalability challenges when designing productive inference systems. This paper presents Inferray, an implementation of RDFS, $ \rho $ df, and RDFS-Plus inference with improved performance over existing solutions. The main features of Inferray are (1) a storage layout based on vertical partitioning that guarantees sequential access and efficient sort-merge join inference; (2) efficient sorting of pairs of 64-bit integers using ad-hoc optimizations on MSD radix and a custom counting sort; (3) a dedicated temporary storage to perform efficient graph closure computation. Our measurements on synthetic and real-world datasets show improvements over competitors on RDFS-Plus, and up to several orders of magnitude for transitivity closure.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Makreshanski:2016:MES, author = "Darko Makreshanski and Georgios Giannikis and Gustavo Alonso and Donald Kossmann", title = "{MQJoin}: efficient shared execution of main-memory joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "6", pages = "480--491", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:09:59 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database architectures typically process queries one-at-a-time, executing concurrent queries in independent execution contexts. Often, such a design leads to unpredictable performance and poor scalability. One approach to circumvent the problem is to take advantage of sharing opportunities across concurrently running queries. In this paper we propose Many-Query Join (MQJoin), a novel method for sharing the execution of a join that can efficiently deal with hundreds of concurrent queries. This is achieved by minimizing redundant work and making efficient use of main-memory bandwidth and multi-core architectures. Compared to existing proposals, MQJoin is able to efficiently handle larger workloads regardless of the schema by exploiting more sharing opportunities. We also compared MQJoin to two commercial main-memory column-store databases. For a TPC-H based workload, we show that MQJoin provides 2--5x higher throughput with significantly more stable response times.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abeywickrama:2016:NNR, author = "Tenindra Abeywickrama and Muhammad Aamir Cheema and David Taniar", title = "$k$-nearest neighbors on road networks: a journey in experimentation and in-memory implementation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "6", pages = "492--503", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:09:59 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A $k$ nearest neighbor ($k$ NN) query on road networks retrieves the $k$ closest points of interest (POIs) by their network distances from a given location. Today, in the era of ubiquitous mobile computing, this is a highly pertinent query. While Euclidean distance has been used as a heuristic to search for the closest POIs by their road network distance, its efficacy has not been thoroughly investigated. The most recent methods have shown significant improvement in query performance. Earlier studies, which proposed disk-based indexes, were compared to the current state-of-the-art in main memory. However, recent studies have shown that main memory comparisons can be challenging and require careful adaptation. This paper presents an extensive experimental investigation in main memory to settle these and several other issues. We use efficient and fair memory-resident implementations of each method to reproduce past experiments and conduct additional comparisons for several overlooked evaluations. Notably we revisit a previously discarded technique (IER) showing that, through a simple improvement, it is often the best performing technique.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2016:BRF, author = "Yuan Yuan and Kaibo Wang and Rubao Lee and Xiaoning Ding and Jing Xing and Spyros Blanas and Xiaodong Zhang", title = "{BCC}: reducing false aborts in optimistic concurrency control with low cost for in-memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "6", pages = "504--515", month = jan, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:09:59 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Optimistic Concurrency Control (OCC) method has been commonly used for in-memory databases to ensure transaction serializability --- a transaction will be aborted if its read set has been changed during execution. This simple criterion to abort transactions causes a large proportion of false positives, leading to excessive transaction aborts. Transactions aborted false-positively (i.e. false aborts) waste system resources and can significantly degrade system throughput (as much as 3.68x based on our experiments) when data contention is intensive. Modern in-memory databases run on systems with increasingly parallel hardware and handle workloads with growing concurrency. They must efficiently deal with data contention in the presence of greater concurrency by minimizing false aborts. This paper presents a new concurrency control method named Balanced Concurrency Control (BCC) which aborts transactions more carefully than OCC does. BCC detects data dependency patterns which can more reliably indicate unserializable transactions than the criterion used in OCC. The paper studies the design options and implementation techniques that can effectively detect data contention by identifying dependency patterns with low overhead. 
To test the performance of BCC, we have implemented it in Silo and compared its performance against that of the vanilla Silo system with OCC and two-phase locking (2PL). Our extensive experiments with TPC-W-like, TPC-C-like and YCSB workloads demonstrate that when data contention is intensive, BCC can increase transaction throughput by more than 3x versus OCC and more than 2x versus 2PL; meanwhile, BCC has comparable performance with OCC for workloads with low data contention.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2016:EEG, author = "Long Yuan and Lu Qin and Xuemin Lin and Lijun Chang and Wenjie Zhang", title = "{I/O} efficient {ECC} graph decomposition via graph reduction", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "516--527", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of computing $k$-edge connected components ($k$-ECCs) of a graph G for a specific $k$ is a fundamental graph problem and has been investigated recently. In this paper, we study the problem of ECC decomposition, which computes the $k$-ECCs of a graph G for all $k$ values. ECC decomposition can be widely applied in a variety of applications such as graph-topology analysis, community detection, Steiner component search, and graph visualization. A straightforward solution for ECC decomposition is to apply the existing $k$-ECC computation algorithm to compute the $k$-ECCs for all $k$ values. However, this solution is not applicable to large graphs for two challenging reasons. First, all existing $k$-ECC computation algorithms are highly memory intensive due to the complex data structures used in the algorithms. 
Second, the number of possible $k$ values can be very large, resulting in a high computational cost when each $k$ value is independently considered. In this paper, we address the above challenges, and study I/O efficient ECC decomposition via graph reduction. We introduce two elegant graph reduction operators which aim to reduce the size of the graph loaded in memory while preserving the connectivity information of a certain set of edges to be computed for a specific $k$. We also propose three novel I/O efficient algorithms, Bottom-Up, Top-Down, and Hybrid, that explore the $k$ values in different orders to reduce the redundant computations between different $k$ values. We analyze the I/O and memory costs for all proposed algorithms. In our experiments, we evaluate our algorithms using seven real large datasets with various graph properties, one of which contains 1.95 billion edges. The experimental results show that our proposed algorithms are scalable and efficient.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Binnig:2016:ESN, author = "Carsten Binnig and Andrew Crotty and Alex Galakatos and Tim Kraska and Erfan Zamanian", title = "The end of slow networks: it's time for a redesign", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "528--539", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The next generation of high-performance networks with remote direct memory access (RDMA) capabilities requires a fundamental rethinking of the design of distributed in-memory DBMSs. 
These systems are commonly built under the assumption that the network is the primary bottleneck and should be avoided at all costs, but this assumption no longer holds. For instance, with InfiniBand FDR $ 4 \times $, the bandwidth available to transfer data across the network is in the same ballpark as the bandwidth of one memory channel. Moreover, RDMA transfer latencies continue to rapidly improve as well. In this paper, we first argue that traditional distributed DBMS architectures cannot take full advantage of high-performance networks and suggest a new architecture to address this problem. Then, we discuss initial results from a prototype implementation of our proposed architecture for OLTP and OLAP, showing remarkable performance improvements over existing designs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2016:LLE, author = "Jiewen Huang and Daniel J. Abadi", title = "{Leopard}: lightweight edge-oriented partitioning and replication for dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "540--551", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper introduces a dynamic graph partitioning algorithm, designed for large, constantly changing graphs. We propose a partitioning framework that adjusts on the fly as the graph structure changes. We also introduce a replication algorithm that is tightly integrated with the partitioning algorithm, which further reduces the number of edges cut by the partitioning algorithm. 
Even though the proposed approach is handicapped by only taking into consideration local parts of the graph when reassigning vertices, extensive evaluation shows that the proposed approach maintains a quality partitioning over time, which is comparable at any point in time to performing a full partitioning from scratch using a state-of-the-art static graph partitioning algorithm such as METIS. Furthermore, when vertex replication is turned on, edge-cut can improve by an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gribkoff:2016:SDP, author = "Eric Gribkoff and Dan Suciu", title = "{SlimShot}: in-database probabilistic inference for knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "552--563", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Increasingly large Knowledge Bases are being created, by crawling the Web or other corpora of documents, and by extracting facts and relations using machine learning techniques. To manage the uncertainty in the data, these KBs rely on probabilistic engines based on Markov Logic Networks (MLN), for which probabilistic inference remains a major challenge. Today's state of the art systems use variants of MCMC, which have no theoretical error guarantees, and, as we show, suffer from poor performance in practice. In this paper we describe SlimShot (Scalable Lifted Inference and Monte Carlo Sampling Hybrid Optimization Technique), a probabilistic inference engine for knowledge bases. 
SlimShot converts the MLN to a tuple-independent probabilistic database, then uses a simple Monte Carlo-based inference, with three key enhancements: (1) it combines sampling with safe query evaluation, (2) it estimates a conditional probability by jointly computing the numerator and denominator, and (3) it adjusts the proposal distribution based on the sample cardinality. In combination, these three techniques allow us to give formal error guarantees, and we demonstrate empirically that SlimShot outperforms today's state of the art probabilistic inference engines used in knowledge bases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2016:GPQ, author = "Da Yan and James Cheng and M. Tamer {\"O}zsu and Fan Yang and Yi Lu and John C. S. Lui and Qizhen Zhang and Wilfred Ng", title = "A general-purpose query-centric framework for querying big graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "564--575", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Pioneered by Google's Pregel, many distributed systems have been developed for large-scale graph analytics. These systems employ a user-friendly ``think like a vertex'' programming model, and exhibit good scalability for tasks where the majority of graph vertices participate in computation. However, the design of these systems can seriously under-utilize the resources in a cluster for processing light-workload graph queries, where only a small fraction of vertices need to be accessed. In this work, we develop a new open-source system, called Quegel, for querying big graphs. 
Quegel treats queries as first-class citizens in its design: users only need to specify the Pregel-like algorithm for a generic query, and Quegel processes light-workload graph queries on demand, using a novel superstep-sharing execution model to effectively utilize the cluster resources. Quegel further provides a convenient interface for constructing graph indexes, which significantly improve query performance but are not supported by existing graph-parallel systems. Our experiments verified that Quegel is highly efficient in answering various types of graph queries and is up to orders of magnitude faster than existing systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Brucato:2016:SPQ, author = "Matteo Brucato and Juan Felipe Beltran and Azza Abouzied and Alexandra Meliou", title = "Scalable package queries in relational database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "576--587", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional database queries follow a simple model: they define constraints that each tuple in the result must satisfy. This model is computationally efficient, as the database system can evaluate the query conditions on each tuple individually. However, many practical, real-world problems require a collection of result tuples to satisfy constraints collectively, rather than individually. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. 
Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. We prove that PaQL is at least as expressive as integer linear programming, and therefore, evaluation of package queries is in general NP-hard. Second, we present a fundamental evaluation strategy that combines the capabilities of databases and constraint optimization solvers to derive solutions to package queries. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, a scalable algorithm for package evaluation, with strong approximation guarantees ($ (1 \pm \epsilon)^6$-factor approximation). Finally, we present extensive experiments over real-world and benchmark data. The results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:STK, author = "Xiang Wang and Ying Zhang and Wenjie Zhang and Xuemin Lin and Zengfeng Huang", title = "{Skype}: top-$k$ spatial-keyword publish\slash subscribe over sliding window", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "588--599", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As the prevalence of social media and GPS-enabled devices, a massive amount of geo-textual data has been generated in a stream fashion, leading to a variety of applications such as location-based recommendation and information dissemination. In this paper, we investigate a novel real-time top-$k$ monitoring problem over sliding window of streaming data; that is, we continuously maintain the top-$k$ most relevant geo-textual messages (e.g., geo-tagged tweets) for a large number of spatial-keyword subscriptions (e.g., registered users interested in local events) simultaneously. To provide the most recent information under controllable memory cost, sliding window model is employed on the streaming geo-textual data. To the best of our knowledge, this is the first work to study top-$k$ spatial-keyword publish/subscribe over sliding window. A novel system, called Skype (Top-$k$ Spatial-keyword Publish/Subscribe), is proposed in this paper. In Skype, to continuously maintain top-$k$ results for massive subscriptions, we devise a novel indexing structure upon subscriptions such that each incoming message can be immediately delivered on its arrival. 
Moreover, to reduce the expensive top-$k$ re-evaluation cost triggered by message expiration, we develop a novel cost-based $k$-skyband technique to reduce the number of re-evaluations in a cost-effective way. Extensive experiments verify the great efficiency and effectiveness of our proposed techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Asudeh:2016:DSW, author = "Abolfazl Asudeh and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das", title = "Discovering the skyline of web databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "7", pages = "600--611", month = mar, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 19 10:10:00 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many web databases are ``hidden'' behind proprietary search interfaces that enforce the top-$k$ output constraint, i.e., each query returns at most $k$ of all matching tuples, preferentially selected and returned according to a proprietary ranking function. In this paper, we initiate research into the novel problem of skyline discovery over top-$k$ hidden web databases. Since skyline tuples provide critical insights into the database and include the top-ranked tuple for every possible ranking function following the monotonic order of attribute values, skyline discovery from a hidden web database can enable a wide variety of innovative third-party applications over one or multiple web databases. Our research in the paper shows that the critical factor affecting the cost of skyline discovery is the type of search interface controls provided by the website. 
As such, we develop efficient algorithms for three most popular types, i.e., one-ended range, free range and point predicates, and then combine them to support web databases that feature a mixture of these types. Rigorous theoretical analysis and extensive real-world online and offline experiments demonstrate the effectiveness of our proposed techniques and their superiority over baseline solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2016:CTK, author = "Xiaohang Zhang and Guoliang Li and Jianhua Feng", title = "Crowdsourced top-$k$ algorithms: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "8", pages = "612--623", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2921558.2921559", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:07:35 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourced top-$k$ computation has attracted significant attention recently, thanks to emerging crowdsourcing platforms, e.g., Amazon Mechanical Turk and CrowdFlower. Crowdsourced top-$k$ algorithms ask the crowd to compare the objects and infer the top-$k$ objects based on the crowdsourced comparison results. The crowd may return incorrect answers, but traditional top-$k$ algorithms cannot tolerate the errors from the crowd. To address this problem, the database and machine-learning communities have independently studied the crowdsourced top-$k$ problem. The database community proposes the heuristic-based solutions while the machine-learning community proposes the learning-based methods (e.g., maximum likelihood estimation). However, these two types of techniques have not been compared systematically under the same experimental framework. 
Thus it is rather difficult for a practitioner to decide which algorithm should be adopted. Furthermore, the experimental evaluation of existing studies has several weaknesses. Some methods assume the crowd returns high-quality results and some algorithms are only tested on simulated experiments. To alleviate these limitations, in this paper we present a comprehensive comparison of crowdsourced top-$k$ algorithms. Using various synthetic and real datasets, we evaluate each algorithm in terms of result quality and efficiency on real crowdsourcing platforms. We reveal the characteristics of different techniques and provide guidelines on selecting appropriate algorithms for various scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Maddox:2016:DRD, author = "Michael Maddox and David Goehring and Aaron J. Elmore and Samuel Madden and Aditya Parameswaran and Amol Deshpande", title = "{Decibel}: the relational dataset branching system", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "624--635", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As scientific endeavors and data analysis become increasingly collaborative, there is a need for data management systems that natively support the versioning or branching of datasets to enable concurrent analysis, cleaning, integration, manipulation, or curation of data across teams of individuals. Common practice for sharing and collaborating on datasets involves creating or storing multiple copies of the dataset, one for each stage of analysis, with no provenance information tracking the relationships between these datasets. 
This results not only in wasted storage, but also makes it challenging to track and integrate modifications made by different users to the same dataset. In this paper, we introduce the Relational Dataset Branching System, Decibel, a new relational storage system with built-in version control designed to address these shortcomings. We present our initial design for Decibel and provide a thorough evaluation of three versioned storage engine designs that focus on efficient query processing with minimal storage overhead. We also develop an exhaustive benchmark to enable the rigorous testing of these and future versioned storage engine designs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mann:2016:EES, author = "Willi Mann and Nikolaus Augsten and Panagiotis Bouros", title = "An empirical evaluation of set similarity join techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "636--647", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Set similarity joins compute all pairs of similar sets from two collections of sets. We conduct extensive experiments on seven state-of-the-art algorithms for set similarity joins. These algorithms adopt a filter-verification approach. Our analysis shows that verification has not received enough attention in previous works. In practice, efficient verification inspects only a small, constant number of set elements and is faster than some of the more sophisticated filter techniques. Although we can identify three winners, we find that most algorithms show very similar performance. 
The key technique is the prefix filter, and AllPairs, the first algorithm adopting this technique, is still a relevant competitor. We repeat experiments from previous work and discuss diverging results. All our claims are supported by a detailed analysis of the factors that determine the overall runtime.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Trummer:2016:MQO, author = "Immanuel Trummer and Christoph Koch", title = "Multiple query optimization on the {D-Wave 2X} adiabatic quantum computer", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "648--659", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The D-Wave adiabatic quantum annealer solves hard combinatorial optimization problems leveraging quantum physics. The newest version features over 1000 qubits and was released in August 2015. We were given access to such a machine, currently hosted at NASA Ames Research Center in California, to explore the potential for hard optimization problems that arise in the context of databases. In this paper, we tackle the problem of multiple query optimization (MQO). We show how an MQO problem instance can be transformed into a mathematical formula that complies with the restrictive input format accepted by the quantum annealer. This formula is translated into weights on and between qubits such that the configuration minimizing the input formula can be found via a process called adiabatic quantum annealing. We analyze the asymptotic growth rate of the number of required qubits in the MQO problem dimensions as the number of qubits is currently the main factor restricting applicability. 
We experimentally compare the performance of the quantum annealer against other MQO algorithms executed on a traditional computer. While the problem sizes that can be treated are currently limited, we already find a class of problem instances where the quantum annealer is three orders of magnitude faster than other approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Trummer:2016:PQO, author = "Immanuel Trummer and Christoph Koch", title = "Parallelizing query optimization on shared-nothing architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "660--671", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data processing systems offer an ever increasing degree of parallelism on the levels of cores, CPUs, and processing nodes. Query optimization must exploit high degrees of parallelism in order not to gradually become the bottleneck of query evaluation. We show how to parallelize query optimization at a massive scale. We present algorithms for parallel query optimization in left-deep and bushy plan spaces. At optimization start, we divide the plan space for a given query into partitions of equal size that are explored in parallel by worker nodes. At the end of optimization, each worker returns the optimal plan in its partition to the master which determines the globally optimal plan from the partition-optimal plans. No synchronization or data exchange is required during the actual optimization phase. The amount of data sent over the network, at the start and at the end of optimization, as well as the complexity of serial steps within our algorithms increase only linearly in the number of workers and in the query size. 
The time and space complexity of optimization within one partition decreases uniformly in the number of workers. We parallelize single- and multi-objective query optimization over a cluster with 100 nodes in our experiments, using more than 250 concurrent worker threads (Spark executors). Despite high network latency and task assignment overheads, parallelization yields speedups of up to one order of magnitude for large queries whose optimization takes minutes on a single node.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kalavri:2016:SPA, author = "Vasiliki Kalavri and Tiago Simas and Dionysios Logothetis", title = "The shortest path is not always a straight line: leveraging semi-metricity in graph analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "672--683", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we leverage the concept of the metric backbone to improve the efficiency of large-scale graph analytics. The metric backbone is the minimum subgraph that preserves the shortest paths of a weighted graph. We use the metric backbone in place of the original graph to compute various graph metrics exactly or with good approximation. By computing on a smaller graph, we improve the performance of graph analytics applications on two different systems, a batch graph processing system and a graph database. Further, we provide an algorithm for the computation of the metric backbone on large graphs. While one can compute the metric backbone by solving the all-pairs-shortest-paths problem, this approach incurs prohibitive time and space complexity for big graphs. 
Instead, we propose a heuristic that makes computing the metric backbone practical even for large graphs. Additionally, we analyze several real datasets of different sizes and domains and we show that we can approximate the metric backbone by removing only first-order semi-metric edges; edges for which a shorter two-hop path exists. We provide a distributed implementation of our algorithm and apply it in large scale scenarios. We evaluate our algorithm using a variety of real graphs, including a Facebook social network subgraph of $ \approx $50 billion edges. We measure the impact of using the metric backbone on runtime performance in two graph management systems. We achieve query speedups of up to 6.7x in the Neo4j commercial graph database and job speedups of up to 6x in the Giraph graph processing system.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papadakis:2016:CAA, author = "George Papadakis and Jonathan Svirsky and Avigdor Gal and Themis Palpanas", title = "Comparative analysis of approximate blocking techniques for entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "684--695", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity Resolution is a core task for merging data collections. Due to its quadratic complexity, it typically scales to large volumes of data through blocking: similar entities are clustered into blocks and pair-wise comparisons are executed only between co-occurring entities, at the cost of some missed matches. 
There are numerous blocking methods, and the aim of this work is to offer a comprehensive empirical survey, extending the dimensions of comparison beyond what is commonly available in the literature. We consider 17 state-of-the-art blocking methods and use 6 popular real datasets to examine the robustness of their internal configurations and their relative balance between effectiveness and time efficiency. We also investigate their scalability over a corpus of 7 established synthetic datasets that range from 10,000 to 2 million entities.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2016:EED, author = "Yiran Zhao and Shen Li and Shaohan Hu and Hongwei Wang and Shuochao Yao and Huajie Shao and Tarek Abdelzaher", title = "An experimental evaluation of datacenter workloads on low-power embedded micro servers", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "9", pages = "696--707", month = may, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu May 26 16:06:05 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a comprehensive evaluation of an ultra-low power cluster, built upon the Intel Edison based micro servers. The improved performance and high energy efficiency of micro servers have driven both academia and industry to explore the possibility of replacing conventional brawny servers with a larger swarm of embedded micro servers. Existing attempts mostly focus on mobile-class micro servers, whose capacities are similar to mobile phones. We, on the other hand, target on sensor-class micro servers, which are originally intended for uses in wearable technologies, sensor networks, and Internet-of-Things. 
Although sensor-class micro servers have much less capacity, they are touted for minimal power consumption (< 1 Watt), which opens new possibilities of achieving higher energy efficiency in datacenter workloads. Our systematic evaluation of the Edison cluster and comparisons to conventional brawny clusters involve careful workload choosing and laborious parameter tuning, which ensures maximum server utilization and thus fair comparisons. Results show that the Edison cluster achieves up to 3.5x improvement on work-done-per-joule for web service applications and data-intensive MapReduce jobs. In terms of scalability, the Edison cluster scales linearly on the throughput of web service workloads, and also shows satisfactory scalability for MapReduce workloads despite coordination overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2016:CTT, author = "Shaoxu Song and Yue Cao and Jianmin Wang", title = "Cleaning timestamps with temporal constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "708--719", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977798", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Timestamps are often found to be dirty in various scenarios, e.g., in distributed systems with clock synchronization problems or unreliable RFID readers. Without cleaning the imprecise timestamps, temporal-related applications such as provenance analysis or pattern queries are not reliable. To evaluate the correctness of timestamps, temporal constraints could be employed, which declare the distance restrictions between timestamps. 
Guided by such constraints on timestamps, in this paper, we study a novel problem of repairing inconsistent timestamps that do not conform to the required temporal constraints. Following the same line of data repairing, the timestamp repairing problem is to minimally modify the timestamps towards satisfaction of temporal constraints. This problem is practically challenging, given the huge space of possible timestamps. We tackle the problem by identifying a concise set of promising candidates, where an optimal repair solution can always be found. Repair algorithms with efficient pruning are then devised over the identified candidates. Experiments on real datasets demonstrate the superiority of our proposal compared to the state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tan:2016:TRS, author = "Zilong Tan and Shivnath Babu", title = "{Tempo}: robust and self-tuning resource management in multi-tenant parallel databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "720--731", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977799", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-tenant database systems have a component called the Resource Manager, or RM that is responsible for allocating resources to tenants. RMs today do not provide direct support for performance objectives such as: ``Average job response time of tenant A must be less than two minutes'', or ``No more than 5\% of tenant B's jobs can miss the deadline of 1 hour.'' Thus, DBAs have to tinker with the RM's low-level configuration settings to meet such objectives. 
We propose a framework called Tempo that brings simplicity, self-tuning, and robustness to existing RMs. Tempo provides a simple interface for DBAs to specify performance objectives declaratively, and optimizes the RM configuration settings to meet these objectives. Tempo has a solid theoretical foundation which gives key robustness guarantees. We report experiments done on Tempo using production traces of data-processing workloads from companies such as Facebook and Cloudera. These experiments demonstrate significant improvements in meeting desired performance objectives over RM configuration settings specified by human experts.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Daenen:2016:PEM, author = "Jonny Daenen and Frank Neven and Tony Tan and Stijn Vansummeren", title = "Parallel evaluation of multi-semi-joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "732--743", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977800", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While services such as Amazon AWS make computing power abundantly available, adding more computing nodes can incur high costs in, for instance, pay-as-you-go plans while not always significantly improving the net running time (aka wall-clock time) of queries. In this work, we provide algorithms for parallel evaluation of SGF queries in MapReduce that optimize total time, while retaining low net time. Not only can SGF queries specify all semi-join reducers, but also more expressive queries involving disjunction and negation. 
Since SGF queries can be seen as Boolean combinations of (potentially nested) semi-joins, we introduce a novel multi-semi-join (MSJ) MapReduce operator that enables the evaluation of a set of semi-joins in one job. We use this operator to obtain parallel query plans for SGF queries that outvalue sequential plans w.r.t. net time and provide additional optimizations aimed at minimizing total time without severely affecting net time. Even though the latter optimizations are NP-hard, we present effective greedy algorithms. Our experiments, conducted using our own implementation Gumbo on top of Hadoop, confirm the usefulness of parallel query plans, and the effectiveness and scalability of our optimizations, all with a significant improvement over Pig and Hive.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2016:WCE, author = "Jianfei Chen and Kaiwei Li and Jun Zhu and Wenguang Chen", title = "{WarpLDA}: a cache efficient {O(1)} algorithm for latent {Dirichlet} allocation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "744--755", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Developing efficient and scalable algorithms for Latent Dirichlet Allocation (LDA) is of wide interest for many applications. Previous work has developed an $ O(1) $ Metropolis--Hastings (MH) sampling method for each token. However, its performance is far from being optimal due to frequent cache misses caused by random accesses to the parameter matrices. In this paper, we first carefully analyze the memory access behavior of existing algorithms for LDA by cache locality at document level. 
We then develop WarpLDA, which achieves $ O(1) $ time complexity per-token and fits the randomly accessed memory per document in the L3 cache. Our empirical results in a wide range of testing conditions demonstrate that WarpLDA is consistently 5-15x faster than the state-of-the-art MH-based LightLDA, and is faster than the state-of-the-art sparsity aware F+LDA in most settings. Our WarpLDA learns a million topics from 639 millions of documents in only five hours at an unprecedented throughput of 11 billion tokens per second.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eich:2016:FPG, author = "Marius Eich and Pit Fender and Guido Moerkotte", title = "Faster plan generation through consideration of functional dependencies and keys", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "756--767", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "It has been a recognized fact for many years that query execution can benefit from pushing group-by operators down in the operator tree and applying them before a join. This so-called eager aggregation reduces the size(s) of the join argument(s), making join evaluation faster. Lately, the idea enjoyed a revival when it was applied to outer joins for the first time and incorporated in a state-of-the-art plan generator. However, this recent approach is highly dependent on the use of heuristics because of the exponential growth of the search space that goes along with eager aggregation. Finding an optimal solution for larger queries calls for effective optimality preserving pruning mechanisms to reduce the search space size as far as possible. 
By a more thorough investigation of functional dependencies and keys, we provide a set of new pruning criteria and evaluate their effectiveness with respect to the runtime and memory consumption of the resulting plan generator.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schuhknecht:2016:RIR, author = "Felix Martin Schuhknecht and Jens Dittrich and Ankur Sharma", title = "{RUMA} has it: rewired user-space memory access is possible!", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "768--779", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Memory management is one of the most boring topics in database research. It plays a minor role in tasks like free-space management or efficient space usage. Here and there we also realize its impact on database performance when worrying about NUMA-aware memory allocation, data compacting, snapshotting, and defragmentation. But, overall, let's face it: the entire topic sounds as exciting as 'garbage collection' or 'debugging a program for memory leaks'. What if there were a technique that would promote memory management from a third class helper thingie to a first class citizen in algorithm and systems design? What if that technique turned the role of memory management in a database system (and any other data processing system) upside-down? What if that technique could be identified as a key for re-designing various core algorithms with the effect of outperforming existing state-of-the-art methods considerably? Then we would write this paper. We introduce RUMA: Rewired User-space Memory Access. It allows for physiological data management, i.e. 
we allow developers to freely rewire the mappings from virtual to physical memory (in user space) while at the same time exploiting the virtual memory support offered by hardware and operating system. We show that fundamental database building blocks such as array operations, partitioning, sorting, and snapshotting benefit strongly from RUMA.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marcus:2016:WLB, author = "Ryan Marcus and Olga Papaemmanouil", title = "{WiSeDB}: a learning-based workload management advisor for cloud databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "780--791", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Workload management for cloud databases deals with the tasks of resource provisioning, query placement, and query scheduling in a manner that meets the application's performance goals while minimizing the cost of using cloud resources. Existing solutions have approached these three challenges in isolation while aiming to optimize a single performance metric. In this paper, we introduce WiSeDB, a learning-based framework for generating holistic workload management solutions customized to application-defined performance goals and workload characteristics. Our approach relies on supervised learning to train cost-effective decision tree models for guiding query placement, scheduling, and resource provisioning decisions. Applications can use these models for both batch and online scheduling of incoming workloads. A unique feature of our system is that it can adapt its offline model to stricter/looser performance goals with minimal re-training. 
This allows us to present to the application alternative workload management strategies that address the typical performance vs. cost trade-off of cloud services. Experimental results show that our approach has very low training overhead while offering low cost strategies for a variety of performance metrics and workload characteristics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{DeFrancisciMorales:2016:SSS, author = "Gianmarco {De Francisci Morales} and Aristides Gionis", title = "Streaming similarity self-join", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "792--803", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce and study the problem of computing the similarity self-join in a streaming context (SSSJ), where the input is an unbounded stream of items arriving continuously. The goal is to find all pairs of items in the stream whose similarity is greater than a given threshold. The simplest formulation of the problem requires unbounded memory, and thus, it is intractable. To make the problem feasible, we introduce the notion of time-dependent similarity: the similarity of two items decreases with the difference in their arrival time. By leveraging the properties of this time-dependent similarity function, we design two algorithmic frameworks to solve the SSSJ problem. The first one, MiniBatch (MB), uses existing index-based filtering techniques for the static version of the problem, and combines them in a pipeline. 
The second framework, Streaming (STR), adds time filtering to the existing indexes, and integrates new time-based bounds deeply in the working of the algorithms. We also introduce a new indexing technique (L2), which is based on an existing state-of-the-art indexing technique (L2AP), but is optimized for the streaming case. Extensive experiments show that the STR algorithm, when instantiated with the L2 index, is the most scalable option across a wide array of datasets and parameters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schatzle:2016:SRQ, author = "Alexander Sch{\"a}tzle and Martin Przyjaciel-Zablocki and Simon Skilevic and Georg Lausen", title = "{S2RDF}: {RDF} querying with {SPARQL} on {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "804--815", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977806", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RDF has become very popular for semantic data publishing due to its flexible and universal graph-like data model. Thus, the ever-increasing size of RDF data collections raises the need for scalable distributed approaches. We endorse the usage of existing infrastructures for Big Data processing like Hadoop for this purpose. Yet, SPARQL query performance is a major challenge as Hadoop is not intentionally designed for RDF processing. Existing approaches often favor certain query pattern shapes while performance drops significantly for other shapes.
In this paper, we introduce a novel relational partitioning schema for RDF data called ExtVP that uses a semi-join based preprocessing, akin to the concept of Join Indices in relational databases, to efficiently minimize query input size regardless of its pattern shape and diameter. Our prototype system S2RDF is built on top of Spark and uses SQL to execute SPARQL queries over ExtVP. We demonstrate its superior performance in comparison to state of the art SPARQL-on-Hadoop approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Singh:2016:BSS, author = "Rishabh Singh", title = "{BlinkFill}: semi-supervised programming by example for syntactic string transformations", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "816--827", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977807", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recent Programming By Example (PBE) techniques such as FlashFill have shown great promise for enabling end-users to perform data transformation tasks using input-output examples. Since examples are inherently an under-specification, there are typically a large number of hypotheses conforming to the examples, and the PBE techniques suffer from scalability issues for finding the intended program amongst the large space. We present a semi-supervised learning technique to significantly reduce this ambiguity by using the logical information present in the input data to guide the synthesis algorithm. 
We develop a data structure InputDataGraph to succinctly represent a large set of logical patterns that are shared across the input data, and use this graph to efficiently learn substring expressions in a new PBE system BlinkFill. We evaluate BlinkFill on 207 real-world benchmarks and show that BlinkFill is significantly faster (on average 41x) and requires fewer input-output examples (1.27 vs 1.53) to learn the desired transformations in comparison to FlashFill.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deng:2016:MEM, author = "Dong Deng and Guoliang Li and He Wen and H. V. Jagadish and Jianhua Feng", title = "{META}: an efficient matching-based method for error-tolerant autocompletion", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "10", pages = "828--839", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2977797.2977808", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Autocompletion has been widely adopted in many computing systems because it can instantly provide users with results as users type in queries. Since the typing task is tedious and prone to error, especially on mobile devices, a recent trend is to tolerate errors in autocompletion. Existing error-tolerant autocompletion methods build a trie to index the data, utilize the trie index to compute the trie nodes that are similar to the query, called active nodes, and identify the leaf descendants of active nodes as the results. However these methods have two limitations. First, they involve many redundant computations to identify the active nodes. Second, they do not support top-$k$ queries.
To address these problems, we propose a matching-based framework, which computes the answers based on matching characters between queries and data. We design a compact tree index to maintain active nodes in order to avoid the redundant computations. We devise an incremental method to efficiently answer top-$k$ queries. Experimental results on real datasets show that our method outperforms state-of-the-art approaches by 1--2 orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2016:SSS, author = "Weiguo Zheng and Lei Zou and Wei Peng and Xifeng Yan and Shaoxu Song and Dongyan Zhao", title = "Semantic {SPARQL} similarity search over {RDF} knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "11", pages = "840--851", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2983200.2983201", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RDF knowledge graphs have attracted increasing attentions these years. However, due to the schema-free nature of RDF data, it is very difficult for users to have full knowledge of the underlying schema. Furthermore, the same kind of information can be represented in diverse graph fragments. Hence, it is a huge challenge to formulate complex SPARQL expressions by taking the union of all possible structures. In this paper, we propose an effective framework to access the RDF repository even if users have no full knowledge of the underlying schema. Specifically, given a SPARQL query, the system could return as more answers that match the query based on the semantic similarity as possible. Interestingly, we propose a systematic method to mine diverse semantically equivalent structure patterns.
More importantly, incorporating both structural and semantic similarities we are the first to propose a novel similarity measure, semantic graph edit distance. In order to improve the efficiency performance, we apply the semantic summary graph to summarize the knowledge graph, which supports both high-level pruning and drill-down pruning. We also devise an effective lower bound based on the TA-style access to each of the candidate sets. Extensive experiments over real datasets confirm the effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dubey:2016:WHP, author = "Ayush Dubey and Greg D. Hill and Robert Escriva and Emin G{\"u}n Sirer", title = "{Weaver}: a high-performance, transactional graph database based on refinable timestamps", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "11", pages = "852--863", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2983200.2983202", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph databases have become a common infrastructure component. Yet existing systems either operate on offline snapshots, provide weak consistency guarantees, or use expensive concurrency control techniques that limit performance. In this paper, we introduce a new distributed graph database, called Weaver, which enables efficient, transactional graph analyses as well as strictly serializable ACID transactions on dynamic graphs. The key insight that allows Weaver to combine strict serializability with horizontal scalability and high performance is a novel request ordering mechanism called refinable timestamps. 
This technique couples coarse-grained vector timestamps with a fine-grained timeline oracle to pay the overhead of strong consistency only when needed. Experiments show that Weaver enables a Bitcoin blockchain explorer that is 8x faster than Blockchain.info, and achieves 10.9x higher throughput than the Titan graph database on social network workloads and 4x lower latency than GraphLab on offline graph traversal workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2016:DDD, author = "Xu Chu and Ihab F. Ilyas and Paraschos Koutris", title = "Distributed data deduplication", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "11", pages = "864--875", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2983200.2983203", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data deduplication refers to the process of identifying tuples in a relation that refer to the same real world entity. The complexity of the problem is inherently quadratic with respect to the number of tuples, since a similarity value must be computed for every pair of tuples. To avoid comparing tuple pairs that are obviously non-duplicates, blocking techniques are used to divide the tuples into blocks and only tuples within the same block are compared. However, even with the use of blocking, data deduplication remains a costly problem for large datasets. In this paper, we show how to further speed up data deduplication by leveraging parallelism in a shared-nothing computing environment. Our main contribution is a distribution strategy, called Dis-Dedup, that minimizes the maximum workload across all worker nodes and provides strong theoretical guarantees. 
We demonstrate the effectiveness of our proposed strategy by performing extensive experiments on both synthetic datasets with varying block size distributions, as well as real world datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arenas:2016:FAC, author = "Marcelo Arenas and Francisco Maturana and Cristian Riveros and Domagoj Vrgo{\v{c}}", title = "A framework for annotating {CSV}-like data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "11", pages = "876--887", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2983200.2983204", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose a simple and expressive framework for adding metadata to CSV documents and their noisy variants. The framework is based on annotating parts of the document that can be later used to read, query, or exchange the data. The core of our framework is a language based on extended regular expressions that are used for selecting data. These expressions are then combined using a set of rules in order to annotate the data. We study the computational complexity of implementing our framework and present an efficient evaluation algorithm that runs in time proportional to its output and linear in its input. As a proof of concept, we test an implementation of our framework against a large number of real world datasets and show that it can be efficiently used in practice.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Asudeh:2016:QRS, author = "Abolfazl Asudeh and Nan Zhang and Gautam Das", title = "Query reranking as a service", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "11", pages = "888--899", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2983200.2983205", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ranked retrieval model has rapidly become the de facto way for search query processing in client-server databases, especially those on the web. Despite of the extensive efforts in the database community on designing better ranking functions/mechanisms, many such databases in practice still fail to address the diverse and sometimes contradicting preferences of users on tuple ranking, perhaps (at least partially) due to the lack of expertise and/or motivation for the database owner to design truly effective ranking functions. This paper takes a different route on addressing the issue by defining a novel query reranking problem, i.e., we aim to design a third-party service that uses nothing but the public search interface of a client-server database to enable the on-the-fly processing of queries with any user-specified ranking functions (with or without selection conditions), no matter if the ranking function is supported by the database or not. We analyze the worst-case complexity of the problem and introduce a number of ideas, e.g., on-the-fly indexing, domination detection and virtual tuple pruning, to reduce the average-case cost of the query reranking algorithm. 
We also present extensive experimental results on real-world datasets, in both offline and live online systems, that demonstrate the effectiveness of our proposed techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ma:2016:GSF, author = "Hongbin Ma and Bin Shao and Yanghua Xiao and Liang Jeff Chen and Haixun Wang", title = "{G-SQL}: fast query processing via graph exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "900--911", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994510", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A lot of real-life data are of graph nature. However, it is not until recently that business begins to exploit data's connectedness for business insights. On the other hand, RDBMSs are a mature technology for data management, but they are not for graph processing. Take graph traversal, a common graph operation for example, it heavily relies on a graph primitive that accesses a given node's neighborhood. We need to join tables following foreign keys to access the nodes in the neighborhood if an RDBMS is used to manage graph data. Graph exploration is a fundamental building block of many graph algorithms. But this simple operation is costly due to a large volume of I/O caused by the massive amount of table joins. In this paper, we present G-SQL, our effort toward the integration of an RDBMS and a native in-memory graph processing engine. G-SQL leverages the fast graph exploration capability provided by the graph engine to answer multi-way join queries. Meanwhile, it uses RDBMSs to provide mature data management functionalities, such as reliable data storage and additional data access methods. 
Specifically, G-SQL is a SQL dialect augmented with graph exploration functionalities and it dispatches query tasks to the in-memory graph engine and its underlying RDBMS. The G-SQL runtime coordinates the two query processors via a unified cost model to ensure the entire query is processed efficiently. Experimental results show that our approach greatly expands capabilities of RDBMSs and delivers exceptional performance for SQL-graph hybrid queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2016:MOD, author = "Mingxing Zhang and Yongwei Wu and Kang Chen and Teng Ma and Weimin Zheng", title = "Measuring and optimizing distributed array programs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "912--923", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994511", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays, there is a rising trend of building array-based distributed computing frameworks, which are suitable for implementing many machine learning and data mining algorithms. However, most of these frameworks only execute each primitive in an isolated manner and in the exact order defined by programmers, which implies a huge space for optimization. In this paper, we propose a novel array-based programming model, named Kasen, which distinguishes itself from models in the existing literature by defining a strict computation and communication model. This model makes it easy to analyze programs' behavior and measure their performance, with which we design a corresponding optimizer that can automatically apply high-level optimizations to the original programs written by programmers. 
According to our evaluation, the optimizer of Kasen can achieve a significant reduction on memory read/write, buffer allocation and network traffic, which leads to a speedup up to 5.82x.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jo:2016:YHP, author = "Insoon Jo and Duck-Ho Bae and Andre S. Yoon and Jeong-Uk Kang and Sangyeun Cho and Daniel D. G. Lee and Jaeheon Jeong", title = "{YourSQL}: a high-performance database system leveraging in-storage computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "924--935", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994512", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents YourSQL, a database system that accelerates data-intensive queries with the help of additional in-storage computing capabilities. YourSQL realizes very early filtering of data by offloading data scanning of a query to user-programmable solid-state drives. We implement our system on a recent branch of MariaDB (a variant of MySQL). In order to quantify the performance gains of YourSQL, we evaluate SQL queries with varying complexities. Our result shows that YourSQL reduces the execution time of the whole TPC-H queries by $ 3.6 \times $, compared to a vanilla system. Moreover, the average speed-up of the five TPC-H queries with the largest performance gains reaches over $ 15 \times $. Thanks to this significant reduction of execution time, we observe sizable energy savings. 
Our study demonstrates that the YourSQL approach, combining the power of early filtering with end-to-end datapath optimization, can accelerate large-scale analytic queries with lower energy consumption.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2016:LBM, author = "Lu Lu and Xuanhua Shi and Yongluan Zhou and Xiong Zhang and Hai Jin and Cheng Pei and Ligang He and Yuanzhen Geng", title = "Lifetime-based memory management for distributed data processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "936--947", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994513", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory caching of intermediate data and eager combining of data in shuffle buffers have been shown to be very effective in minimizing the re-computation and I/O cost in distributed data processing systems like Spark and Flink. However, it has also been widely reported that these techniques would create a large amount of long-living data objects in the heap, which may quickly saturate the garbage collector, especially when handling a large dataset, and hence would limit the scalability of the system. To eliminate this problem, we propose a lifetime-based memory management framework, which, by automatically analyzing the user-defined functions and data types, obtains the expected lifetime of the data objects, and then allocates and releases memory space accordingly to minimize the garbage collection overhead. 
In particular, we present Deca, a concrete implementation of our proposal on top of Spark, which transparently decomposes and groups objects with similar lifetimes into byte arrays and releases their space altogether when their lifetimes come to an end. An extensive experimental study using both synthetic and real datasets shows that, in comparing to Spark, Deca is able to (1) reduce the garbage collection time by up to 99.9\%, (2) to achieve up to 22.7x speed up in terms of execution time in cases without data spilling and 41.6x speedup in cases with data spilling, and (3) to consume up to 46.6\% less memory.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Krishnan:2016:AID, author = "Sanjay Krishnan and Jiannan Wang and Eugene Wu and Michael J. Franklin and Ken Goldberg", title = "{ActiveClean}: interactive data cleaning for statistical modeling", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "948--959", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994514", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analysts often clean dirty data iteratively---cleaning some data, executing the analysis, and then cleaning more data based on the results. We explore the iterative cleaning process in the context of statistical model training, which is an increasingly popular form of data analytics. We propose ActiveClean, which allows for progressive and iterative cleaning in statistical modeling problems while preserving convergence guarantees. ActiveClean supports an important class of models called convex loss models (e.g., linear regression and SVMs), and prioritizes cleaning those records likely to affect the results. 
We evaluate ActiveClean on five real-world datasets UCI Adult, UCI EEG, MNIST, IMDB, and Dollars For Docs with both real and synthetic errors. The results show that our proposed optimizations can improve model accuracy by up-to 2.5x for the same amount of data cleaned. Furthermore for a fixed cleaning budget and on all real dirty datasets, ActiveClean returns more accurate models than uniform sampling and Active Learning.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Elgohary:2016:CLA, author = "Ahmed Elgohary and Matthias Boehm and Peter J. Haas and Frederick R. Reiss and Berthold Reinwald", title = "Compressed linear algebra for large-scale machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "960--971", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994515", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale machine learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications to converge to an optimal model. It is crucial for performance to fit the data into single-node or distributed main memory. General-purpose, heavy- and lightweight compression techniques struggle to achieve both good compression ratios and fast decompression speed to enable block-wise uncompressed operations. Hence, we initiate work on compressed linear algebra (CLA), in which lightweight database compression techniques are applied to matrices and then linear algebra operations such as matrix-vector multiplication are executed directly on the compressed representations. 
We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show that CLA achieves in-memory operations performance close to the uncompressed case and good compression ratios that allow us to fit larger datasets into available memory. We thereby obtain significant end-to-end performance improvements up to 26x or reduced memory requirements.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karpathiotakis:2016:FQH, author = "Manos Karpathiotakis and Ioannis Alagiannis and Anastasia Ailamaki", title = "Fast queries over heterogeneous data through engine customization", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "972--983", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994516", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Industry and academia are continuously becoming more data-driven and data-intensive, relying on the analysis of a wide variety of heterogeneous datasets to gain insights. The different data models and formats pose a significant challenge on performing analysis over a combination of diverse datasets. Serving all queries using a single, general-purpose query engine is slow. On the other hand, using a specialized engine for each heterogeneous dataset increases complexity: queries touching a combination of datasets require an integration layer over the different engines. This paper presents a system design that natively supports heterogeneous data formats and also minimizes query execution times. For multi-format support, the design uses an expressive query algebra which enables operations over various data models. 
For minimal execution times, it uses a code generation mechanism to mimic the system and storage most appropriate to answer a query fast. We validate our design by building Proteus, a query engine which natively supports queries over CSV, JSON, and relational binary data, and which specializes itself to each query, dataset, and workload via code generation. Proteus outperforms state-of-the-art open-source and commercial systems on both synthetic and real-world workloads without being tied to a single data model or format, all while exposing users to a single query interface.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bhowmick:2016:DDV, author = "Sourav S. Bhowmick and Byron Choi and Curtis Dyreson", title = "Data-driven visual graph query interface construction and maintenance: challenges and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "984--992", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994517", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visual query interfaces make it easy for scientists and other nonexpert users to query a data collection. Heretofore, visual query interfaces have been statically-constructed, independent of the data. In this paper we outline a vision of a different kind of interface, one that is built (in part) from the data. In our data-driven approach, the visual interface is dynamically constructed and maintained. A data-driven approach has many benefits such as reducing the cost in constructing and maintaining an interface, superior support for query formulation, and increased portability of the interface. 
We focus on graph databases, but our approach is applicable to several other kinds of databases such as JSON and XML.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abedjan:2016:DDE, author = "Ziawasch Abedjan and Xu Chu and Dong Deng and Raul Castro Fernandez and Ihab F. Ilyas and Mourad Ouzzani and Paolo Papotti and Michael Stonebraker and Nan Tang", title = "Detecting data errors: where are we and what needs to be done?", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "993--1004", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994518", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data cleaning has played a critical role in ensuring data quality for enterprise applications. Naturally, there has been extensive research in this area, and many data cleaning algorithms have been translated into tools to detect and to possibly repair certain classes of errors such as outliers, duplicates, missing values, and violations of integrity constraints. Since different types of errors may coexist in the same data set, we often need to run more than one kind of tool. In this paper, we investigate two pragmatic questions: (1) are these tools robust enough to capture most errors in real-world data sets? and (2) what is the best strategy to holistically run multiple tools to optimize the detection effort? To answer these two questions, we obtained multiple data cleaning tools that utilize a variety of error detection techniques. We also collected five real-world data sets, for which we could obtain both the raw data and the ground truth on existing errors. 
In this paper, we report our experimental findings on the errors detected by the tools we tested. First, we show that the coverage of each tool is well below 100\%. Second, we show that the order in which multiple tools are run makes a big difference. Hence, we propose a holistic multi-tool strategy that orders the invocations of the available tools to maximize their benefit, while minimizing human effort in verifying results. Third, since this holistic approach still does not lead to acceptable error coverage, we discuss two simple strategies that have the potential to improve the situation, namely domain specific tools and data enrichment. We close this paper by reasoning about the errors that are not detectable by any of the tools we tested.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2016:ESH, author = "Hai Liu and Dongqing Xiao and Pankaj Didwania and Mohamed Y. Eltabakh", title = "Exploiting soft and hard correlations in big data query optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1005--1016", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994519", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data infrastructures are increasingly supporting datasets that are relatively structured. These datasets are full of correlations among their attributes, which if managed in systematic ways would enable optimization opportunities that otherwise will be missed. Unlike relational databases in which discovering and exploiting the correlations in query optimization have been extensively studied, in big data infrastructures, such important data properties and their utilization have been mostly abandoned. 
The key reason is that domain experts may know many correlations but with a degree of uncertainty (fuzziness or softness). Since the data is big, it is very challenging to validate such correlations, judge their worthiness, and put strategies for utilizing them in query optimization. Existing techniques for exploiting soft correlations in RDBMSs, e.g., BHUNT, CORDS, and CM, are heavily tailored towards optimizing factors inherent in relational databases, e.g., predicate selectivity and random I/O accesses of secondary indexes, which are issues not applicable to big data infrastructures, e.g., Hadoop. In this paper, we propose the EXORD system to fill in this gap by exploiting the data's correlations in big data query optimization. EXORD supports two types of correlations; hard correlations---which are guaranteed to hold for all data records, and soft correlations---which are expected to hold for most, but not all, data records. We introduce a new three-phase approach for (1) Validating and judging the worthiness of soft correlations, (2) Selecting and preparing the soft correlations for deployment by specially handling the violating data records, and (3) Deploying and exploiting the correlations in query optimization. We propose a novel cost-benefit model for adaptively selecting the most beneficial soft correlations w.r.t a given query workload while minimizing the introduced overhead. We show the complexity of this problem (NP-Hard), and propose a heuristic to efficiently solve it in a polynomial time. EXORD can be integrated with various state-of-art big data query optimization techniques, e.g., indexing and partitioning. EXORD prototype is implemented as an extension to the Hive engine on top of Hadoop. The experimental evaluation shows the potential of EXORD in achieving more than 10x speedup while introducing minimal storage overheads.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kahng:2016:IBN, author = "Minsuk Kahng and Shamkant B. Navathe and John T. Stasko and Duen Horng Polo Chau", title = "Interactive browsing and navigation in relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1017--1028", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994520", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Although researchers have devoted considerable attention to helping database users formulate queries, many users still find it challenging to specify queries that involve joining tables. To help users construct join queries for exploring relational databases, we propose ETable, a novel presentation data model that provides users with a presentation-level interactive view. This view compactly presents one-to-many and many-to-many relationships within a single enriched table by allowing a cell to contain a set of entity references. Users can directly interact with this enriched table to incrementally construct complex queries and navigate databases on a conceptual entity-relationship level. In a user study, participants performed a range of database querying tasks faster with ETable than with a commercial graphical query builder. Subjective feedback about ETable was also positive. All participants found that ETable was easier to learn and helpful for exploring databases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Borovica-Gajic:2016:CDA, author = "Renata Borovica-Gaji{\'c} and Raja Appuswamy and Anastasia Ailamaki", title = "Cheap data analytics using cold storage devices", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1029--1040", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994521", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Enterprise databases use storage tiering to lower capital and operational expenses. In such a setting, data waterfalls from an SSD-based high-performance tier when it is ``hot'' (frequently accessed) to a disk-based capacity tier and finally to a tape-based archival tier when ``cold'' (rarely accessed). To address the unprecedented growth in the amount of cold data, hardware vendors introduced new devices named Cold Storage Devices (CSD) explicitly targeted at cold data workloads. With access latencies in tens of seconds and cost/GB as low as \$0.01/GB/month, CSD provide a middle ground between the low-latency (ms), high-cost, HDD-based capacity tier, and high-latency (min to h), low-cost, tape-based, archival tier. Driven by the price/performance aspect of CSD, this paper makes a case for using CSD as a replacement for both capacity and archival tiers of enterprise databases. Although CSD offer major cost savings, we show that current database systems can suffer from severe performance drop when CSD are used as a replacement for HDD due to the mismatch between design assumptions made by the query execution engine and actual storage characteristics of the CSD. 
We then build a CSD-driven query execution framework, called Skipper, that modifies both the database execution engine and CSD scheduling algorithms to be aware of each other. Using results from our implementation of the architecture based on PostgreSQL and OpenStack Swift, we show that Skipper is capable of completely masking the high latency overhead of CSD, thereby opening up CSD for wider adoption as a storage tier for cheap data analytics over cold data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shun:2016:PLG, author = "Julian Shun and Farbod Roosta-Khorasani and Kimon Fountoulakis and Michael W. Mahoney", title = "Parallel local graph clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1041--1052", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994522", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph clustering has many important applications in computing, but due to growing sizes of graph, even traditionally fast clustering methods such as spectral partitioning can be computationally expensive for real-world graphs of interest. Motivated partly by this, so-called local algorithms for graph clustering have received significant interest due to the fact that they can find good clusters in a graph with work proportional to the size of the cluster rather than that of the entire graph. This feature has proven to be crucial in making such graph clustering and many of its downstream applications efficient in practice. 
While local clustering algorithms are already faster than traditional algorithms that touch the entire graph, they are sequential and there is an opportunity to make them even more efficient via parallelization. In this paper, we show how to parallelize many of these algorithms in the shared-memory multicore setting, and we analyze the parallel complexity of these algorithms. We present comprehensive experiments on large-scale graphs showing that our parallel algorithms achieve good parallel speedups on a modern multicore machine, thus significantly speeding up the analysis of local graph clusters in the very large-scale setting.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tong:2016:OMM, author = "Yongxin Tong and Jieying She and Bolin Ding and Lei Chen and Tianyu Wo and Ke Xu", title = "Online minimum matching in real-time spatial data: experiments and analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1053--1064", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994523", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, with the development of mobile Internet and smartphones, the online minimum bipartite matching in real time spatial data (OMBM) problem becomes popular. Specifically, given a set of service providers with specific locations and a set of users who dynamically appear one by one, the OMBM problem is to find a maximum-cardinality matching with minimum total distance following that once a user appears, s/he must be immediately matched to an unmatched service provider, which cannot be revoked, before subsequent users arrive. 
To address this problem, existing studies mainly focus on analyzing the worst-case competitive ratios of the proposed online algorithms, but study on the performance of the algorithms in practice is absent. In this paper, we present a comprehensive experimental comparison of the representative algorithms of the OMBM problem. Particularly, we observe a surprising result that the simple and efficient greedy algorithm, which has been considered as the worst due to its exponential worst-case competitive ratio, is significantly more effective than other algorithms. We investigate the results and further show that the competitive ratio of the worst case of the greedy algorithm is actually just a constant, 3.195, in the average-case analysis. We try to clarify a 25-year misunderstanding towards the greedy algorithm and justify that the greedy algorithm is not bad at all. Finally, we provide a uniform implementation for all the algorithms of the OMBM problem and clarify their strengths and weaknesses, which can guide practitioners to select appropriate algorithms for various scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Brunel:2016:IAH, author = "Robert Brunel and Norman May and Alfons Kemper", title = "Index-assisted hierarchical computations in main-memory {RDBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1065--1076", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994524", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We address the problem of expressing and evaluating computations on hierarchies represented as database tables. 
Engine support for such computations is very limited today, and so they are usually outsourced into stored procedures or client code. Recently, data model and SQL language extensions were proposed to conveniently represent and work with hierarchies. On that basis we introduce a concept of structural grouping to relational algebra, provide concise syntax to express a class of useful computations, and discuss algorithms to evaluate them efficiently by exploiting available indexing schemes. This extends the versatility of RDBMS towards a great many use cases dealing with hierarchical data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ohsaka:2016:DIA, author = "Naoto Ohsaka and Takuya Akiba and Yuichi Yoshida and Ken-ichi Kawarabayashi", title = "Dynamic influence analysis in evolving networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1077--1088", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994525", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose the first real-time fully-dynamic index data structure designed for influence analysis on evolving networks. With this aim, we carefully redesign the data structure of the state-of-the-art sketching method introduced by Borgs et al., and construct corresponding update algorithms. Using this index, we present algorithms for two kinds of queries, influence estimation and influence maximization, which are strongly motivated by practical applications, such as viral marketing. We provide a thorough theoretical analysis, which guarantees the non-degeneracy of the solution accuracy after an arbitrary number of updates. 
Furthermore, we introduce a reachability-tree-based technique and a skipping method, which greatly reduce the time consumption required for edge/vertex deletions and vertex additions, respectively, and counter-based random number generators, which improve the space efficiency. Experimental evaluations using real dynamic networks with tens of millions of edges demonstrate the efficiency, scalability, and accuracy of our proposed indexing scheme. Specifically, it can reflect a graph modification within a time of several orders of magnitude smaller than that required to reconstruct an index from scratch, estimate the influence spread of a vertex set accurately within a millisecond, and select highly influential vertices at least ten times faster than state-of-the-art static algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tran:2016:DBO, author = "Luan Tran and Liyue Fan and Cyrus Shahabi", title = "Distance-based outlier detection in data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1089--1100", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994526", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Continuous outlier detection in data streams has important applications in fraud detection, network security, and public health. The arrival and departure of data objects in a streaming manner impose new challenges for outlier detection algorithms, especially in time and space efficiency. 
In the past decade, several studies have been performed to address the problem of distance-based outlier detection in data streams (DODDS), which adopts an unsupervised definition and does not have any distributional assumptions on data values. Our work is motivated by the lack of comparative evaluation among the state-of-the-art algorithms using the same datasets on the same platform. We systematically evaluate the most recent algorithms for DODDS under various stream settings and outlier rates. Our extensive results show that in most settings, the MCOD algorithm offers the superior performance among all the algorithms, including the most recent algorithm Thresh\_LEAP.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mountantonakis:2016:MLC, author = "Michalis Mountantonakis and Yannis Tzitzikas", title = "On measuring the lattice of commonalities among several linked datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1101--1112", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994527", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A big number of datasets has been published according to the principles of Linked Data and this number keeps increasing. Although the ultimate objective is linking and integration, it is not currently evident how connected the current LOD cloud is. 
Measurements (and indexes) that involve more than two datasets are not available although they are important: (a) for obtaining complete information about one particular URI (or set of URIs) with provenance (b) for aiding dataset discovery and selection, (c) for assessing the connectivity between any set of datasets for quality checking and for monitoring their evolution over time, (d) for constructing visualizations that provide more informative overviews. Since it would be prohibitively expensive to perform all these measurements in a na{\"\i}ve way, in this paper we introduce indexes (and their construction algorithms) that can speedup such tasks. In brief, we introduce (i) a namespace-based prefix index, (ii) a sameAs catalog for computing the symmetric and transitive closure of the owl:sameAs relationships encountered in the datasets, (iii) a semantics-aware element index (that exploits the aforementioned indexes), and finally (iv) two lattice-based incremental algorithms for speeding up the computation of the intersection of URIs of any set of datasets. We discuss the speedup obtained by the introduced indexes and algorithms through comparative results and finally we report measurements about connectivity of the LOD cloud that have never been carried out so far.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chang:2016:ORD, author = "Zhao Chang and Dong Xie and Feifei Li", title = "Oblivious {RAM}: a dissection and experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1113--1124", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994528", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many companies choose the cloud as their data and IT infrastructure platform. The remote access of the data brings the issue of trust. Despite the use of strong encryption schemes, adversaries can still learn valuable information regarding encrypted data by observing the data access patterns. To that end, one can hide the access patterns, which may leak sensitive information, using Oblivious RAMs (ORAMs). Numerous works have proposed different ORAM constructions, but they have never been thoroughly compared against and tested on large databases. There are also no open source implementation of these schemes. These limitations make it difficult for researchers and practitioners to choose and adopt a suitable ORAM for their applications. To address this issue, we provide a thorough study over several practical ORAM constructions, and implement them under the same library. We perform extensive experiments to provide insights into their performance characteristics with respect to efficiency, scalability, and communication cost.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kastrati:2016:OCP, author = "Fisnik Kastrati and Guido Moerkotte", title = "Optimization of conjunctive predicates for main memory column stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1125--1136", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994529", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Optimization of queries with conjunctive predicates for main memory databases remains a challenging task. The traditional way of optimizing this class of queries relies on predicate ordering based on selectivities or ranks. However, the optimization of queries with conjunctive predicates is a much more challenging task, requiring a holistic approach in view of (1) an accurate cost model that is aware of CPU architectural characteristics such as branch (mis)prediction, (2) a storage layer, allowing for a streamlined query execution, (3) a common subexpression elimination technique, minimizing column access costs, and (4) an optimization algorithm able to pick the optimal plan even in presence of a small (bounded) estimation error. In this work, we embrace the holistic approach, and show its superiority experimentally. Current approaches typically base their optimization algorithms on at least one of two assumptions: (1) the predicate selectivities are assumed to be independent, (2) the predicate costs are assumed to be constant. Our approach is not based on these assumptions, as they in general do not hold.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chothia:2016:EOM, author = "Zaheer Chothia and John Liagouris and Frank McSherry and Timothy Roscoe", title = "Explaining outputs in modern data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1137--1148", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994530", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We report on the design and implementation of a general framework for interactively explaining the outputs of modern data-parallel computations, including iterative data analytics. To produce explanations, existing works adopt a naive backward tracing approach which runs into known issues; naive backward tracing may identify: (i) too much information that is difficult to process, and (ii) not enough information to reproduce the output, which hinders the logical debugging of the program. The contribution of this work is twofold. First, we provide methods to effectively reduce the size of explanations based on the first occurrence of a record in an iterative computation. Second, we provide a general method for identifying explanations that are sufficient to reproduce the target output in arbitrary computations --- a problem for which no viable solution existed until now. We implement our approach on differential dataflow, a modern high-throughput, low-latency dataflow platform. We add a small (but extensible) set of rules to explain each of its data-parallel operators, and we implement these rules as differential dataflow operators themselves. 
This choice allows our implementation to inherit the performance characteristics of differential dataflow, and results in a system that efficiently computes and updates explanatory inputs even as the inputs of the reference computation change. We evaluate our system with various analytic tasks on real datasets, and we show that it produces concise explanations in tens of milliseconds, while remaining faster --- up to two orders of magnitude --- than even the best implementations that do not support explanations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Buneman:2016:RGA, author = "Peter Buneman and Slawek Staworko", title = "{RDF} graph alignment with bisimulation", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1149--1160", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994531", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We investigate the problem of aligning two RDF databases, an essential problem in understanding the evolution of ontologies. Our approaches address three fundamental challenges: (1) the use of ``blank'' (null) names, (2) ontology changes in which different names are used to identify the same entity, and (3) small changes in the data values as well as small changes in the graph structure of the RDF database. We propose approaches inspired by the classical notion of graph bisimulation and extend them to capture the natural metrics of edit distance on the data values and the graph structure. We evaluate our methods on three evolving curated data sets. Overall, our results show that the proposed methods perform well and are scalable.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bursztyn:2016:TRA, author = "Damian Bursztyn and Fran{\c{c}}ois Goasdou{\'e} and Ioana Manolescu", title = "Teaching an {RDBMS} about ontological constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1161--1172", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994532", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the presence of an ontology, query answers must reflect not only data explicitly present in the database, but also implicit data, which holds due to the ontology, even though it is not present in the database. A large and useful set of ontology languages enjoys FOL reducibility of query answering: answering a query can be reduced to evaluating a certain first-order logic (FOL) formula (obtained from the query and ontology) against only the explicit facts. We present a novel query optimization framework for ontology-based data access settings enjoying FOL reducibility. Our framework is based on searching within a set of alternative equivalent FOL queries, i.e., FOL reformulations, one with minimal evaluation cost when evaluated through a relational database system. We apply this framework to the DL-Lite$_R$ Description Logic underpinning the W3C's OWL2 QL ontology language, and demonstrate through experiments its performance benefits when two leading SQL systems, one open-source and one commercial, are used for evaluating the FOL query reformulations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Simonini:2016:BLS, author = "Giovanni Simonini and Sonia Bergamaschi and H. V. 
Jagadish", title = "{BLAST}: a loosely schema-aware meta-blocking approach for entity resolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1173--1184", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994533", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Identifying records that refer to the same entity is a fundamental step for data integration. Since it is prohibitively expensive to compare every pair of records, blocking techniques are typically employed to reduce the complexity of this task. These techniques partition records into blocks and limit the comparison to records co-occurring in a block. Generally, to deal with highly heterogeneous and noisy data (e.g. semi-structured data of the Web), these techniques rely on redundancy to reduce the chance of missing matches. Meta-blocking is the task of restructuring blocks generated by redundancy-based blocking techniques, removing superfluous comparisons. Existing meta-blocking approaches rely exclusively on schema-agnostic features. In this paper, we demonstrate how ``loose'' schema information (i.e., statistics collected directly from the data) can be exploited to enhance the quality of the blocks in a holistic loosely schema-aware (meta-)blocking approach that can be used to speed up your favorite Entity Resolution algorithm. We call it Blast (Blocking with Loosely-Aware Schema Techniques). We show how Blast can automatically extract this loose information by adopting a LSH-based step for efficiently scaling to large datasets. We experimentally demonstrate, on real-world datasets, how Blast outperforms the state-of-the-art unsupervised meta-blocking approaches, and, in many cases, also the supervised one.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2016:LEI, author = "Erkang Zhu and Fatemeh Nargesian and Ken Q. Pu and Ren{\'e}e J. Miller", title = "{LSH} ensemble: {Internet}-scale domain search", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1185--1196", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994534", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of domain search where a domain is a set of distinct values from an unspecified universe. We use Jaccard set containment score, defined as $ | Q \cap X | / | Q | $, as the measure of relevance of a domain $X$ to a query domain $Q$. Our choice of Jaccard set containment over Jaccard similarity as a measure of relevance makes our work particularly suitable for searching Open Data and data on the web, as Jaccard similarity is known to have poor performance over sets with large differences in their domain sizes. We demonstrate that the domains found in several real-life Open Data and web data repositories show a power-law distribution over their domain sizes. We present a new index structure, Locality Sensitive Hashing (LSH) Ensemble, that solves the domain search problem using set containment at Internet scale. Our index structure and search algorithm cope with the data volume and skew by means of data sketches using Minwise Hashing and domain partitioning. Our index structure does not assume a prescribed set of data values. We construct a cost model that describes the accuracy of LSH Ensemble with any given partitioning. This allows us to formulate the data partitioning for LSH Ensemble as an optimization problem. We prove that there exists an optimal partitioning for any data distribution. 
Furthermore, for datasets following a power-law distribution, as observed in Open Data and Web data corpora, we show that the optimal partitioning can be approximated using equi-depth, making it particularly efficient to use in practice. We evaluate our algorithm using real data (Canadian Open Data and WDC Web Tables) containing up over 262 million domains. The experiments demonstrate that our index consistently outperforms other leading alternatives in accuracy and performance. The improvements are most dramatic for data with large skew in the domain sizes. Even at 262 million domains, our index sustains query performance with under 3 seconds response time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Konda:2016:MTBa, author = "Pradap Konda and Sanjib Das and Paul {Suganthan G. C.} and AnHai Doan and Adel Ardalan and Jeffrey R. Ballard and Han Li and Fatemah Panahi and Haojun Zhang and Jeff Naughton and Shishir Prasad and Ganesh Krishnan and Rohit Deep and Vijay Raghavendra", title = "{Magellan}: toward building entity matching management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1197--1208", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994535", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity matching (EM) has been a long-standing challenge in data management. Most current EM works focus only on developing matching algorithms. We argue that far more efforts should be devoted to building EM systems. We discuss the limitations of current EM systems, then present as a solution Magellan, a new kind of EM systems. Magellan is novel in four important aspects.
(1) It provides how-to guides that tell users what to do in each EM scenario, step by step. (2) It provides tools to help users do these steps; the tools seek to cover the entire EM pipeline, not just matching and blocking as current EM systems do. (3) Tools are built on top of the data analysis and Big Data stacks in Python, allowing Magellan to borrow a rich set of capabilities in data cleaning, IE, visualization, learning, etc. (4) Magellan provides a powerful scripting environment to facilitate interactive experimentation and quick ``patching'' of the system. We describe research challenges raised by Magellan, then present extensive experiments with 44 students and users at several organizations that show the promise of the Magellan approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Saha:2016:AOD, author = "Diptikalyan Saha and Avrilia Floratou and Karthik Sankaranarayanan and Umar Farooq Minhas and Ashish R. Mittal and Fatma {\"O}zcan", title = "{ATHENA}: an ontology-driven system for natural language querying over relational data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1209--1220", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994536", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present ATHENA, an ontology-driven system for natural language querying of complex relational databases. Natural language interfaces to databases enable users easy access to data, without the need to learn a complex query language, such as SQL. ATHENA uses domain specific ontologies, which describe the semantic entities, and their relationships in a domain. 
We propose a unique two-stage approach, where the input natural language query (NLQ) is first translated into an intermediate query language over the ontology, called OQL, and subsequently translated into SQL. Our two-stage approach allows us to decouple the physical layout of the data in the relational store from the semantics of the query, providing physical independence. Moreover, ontologies provide richer semantic information, such as inheritance and membership relations, that are lost in a relational schema. By reasoning over the ontologies, our NLQ engine is able to accurately capture the user intent. We study the effectiveness of our approach using three different workloads on top of geographical (GEO), academic (MAS) and financial (FIN) data. ATHENA achieves 100\% precision on the GEO and MAS workloads, and 99\% precision on the FIN workload which operates on a complex financial ontology. Moreover, ATHENA attains 87.2\%, 88.3\%, and 88.9\% recall on the GEO, MAS, and FIN workloads, respectively.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wesley:2016:ICC, author = "Richard Wesley and Fei Xu", title = "Incremental computation of common windowed holistic aggregates", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1221--1232", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994537", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Windowed aggregates are a SQL 2003 feature for computing aggregates in moving windows. Common examples include cumulative sums, local maxima and moving quantiles. 
With the advent over the last few years of easy-to-use data analytics tools, these functions are becoming widely used by more and more analysts, but some aggregates (such as local maxima) are much easier to compute than others (such as moving quantiles). Nevertheless, aggregates that are more difficult to compute, like quantile and mode (or ``most frequent'') provide more appropriate statistical summaries in the common situation when a distribution is not Gaussian and are an essential part of a data analysis toolkit. Recent work has described highly efficient windowed implementations of the most common aggregate function categories, including distributive aggregates such as cumulative sums and algebraic aggregates such as moving averages. But little has been published on either the implementation or the performance of the more complex holistic windowed aggregates such as moving quantiles. This paper provides the first in-depth study of how to efficiently implement the three most common holistic windowed aggregates (count distinct, mode and quantile) by reusing the aggregate state between consecutive frames. Our measurements show that these incremental algorithms generally achieve improvements of about 10x over na{\"\i}ve implementations, and that they can effectively detect when to reset the internal state during extreme frame variation.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2016:ECS, author = "Yixiang Fang and Reynold Cheng and Siqiang Luo and Jiafeng Hu", title = "Effective community search for large attributed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "12", pages = "1233--1244", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/2994509.2994538", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 6 16:21:12 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a graph $G$ and a vertex $ q \in G$, the community search query returns a subgraph of $G$ that contains vertices related to $q$. Communities, which are prevalent in attributed graphs such as social networks and knowledge bases, can be used in emerging applications such as product advertisement and setting up of social events. In this paper, we investigate the attributed community query (or ACQ), which returns an attributed community (AC) for an attributed graph. The AC is a subgraph of $G$, which satisfies both structure cohesiveness (i.e., its vertices are tightly connected) and keyword cohesiveness (i.e., its vertices share common keywords). The AC enables a better understanding of how and why a community is formed (e.g., members of an AC have a common interest in music, because they all have the same keyword ``music''). An AC can be ``personalized''; for example, an ACQ user may specify that an AC returned should be related to some specific keywords like ``research'' and ``sports''. To enable efficient AC search, we develop the CL-tree index structure and three algorithms based on it. We evaluate our solutions on four large graphs, namely Flickr, DBLP, Tencent, and DBpedia. Our results show that ACs are more effective and efficient than existing community retrieval approaches. 
Moreover, an AC contains more precise and personalized information than that of existing community search and detection methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lang:2016:TIA, author = "Willis Lang and Karthik Ramachandra and David J. DeWitt and Shize Xu and Qun Guo and Ajay Kalhan and Peter Carlin", title = "Not for the timid: on the impact of aggressive over-booking in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1245--1256", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To lower hosting costs and service prices, database-as-a-service (DBaaS) providers strive to maximize cluster utilization without negatively affecting their users' service experience. Some of the most effective approaches for increasing service efficiency result in the over-booking of the cluster with user databases. For instance, one approach is to reclaim cluster capacity from a database when it is idle, temporarily re-using the capacity for some other purpose, and over-booking the cluster's resources. Such approaches are largely driven by policies that determine when it is prudent to temporarily reclaim capacity from an idle database. In this paper, we examine policies that inherently tune the system's idle sensitivity. Increased sensitivity to idleness leads to aggressive over-booking while the converse leads to conservative reclamation and lower utilization levels. Aggressive over-booking also incurs a ``reserve'' capacity cost (for when we suddenly ``owe'' capacity to previously idle databases.) 
We answer these key questions in this paper: (1) how to find a ``good'' resource reclamation policy for a given DBaaS cluster of users; and (2) how to forecast the needed near-term reserve capacity. To help us answer these questions, we used production user activity traces from Azure SQL DB and built models of an over-booking mechanism. We show that choosing the right policy can substantially boost the efficiency of the service, facilitating lower service prices via lower amortized infrastructure costs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sevenich:2016:UDS, author = "Martin Sevenich and Sungpack Hong and Oskar van Rest and Zhe Wu and Jayanta Banerjee and Hassan Chafi", title = "Using domain-specific languages for analytic graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1257--1268", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently graph has been drawing lots of attention both as a natural data model that captures fine-grained relationships between data entities and as a tool for powerful data analysis that considers such relationships. In this paper, we present a new graph database system that integrates a robust graph storage with an efficient graph analytics engine. Primarily, our system adopts two domain-specific languages (DSLs), one for describing graph analysis algorithms and the other for graph pattern matching queries. Compared to the API-based approaches in conventional graph processing systems, the DSL-based approach provides users with more flexible and intuitive ways of expressing algorithms and queries. 
Moreover, the DSL-based approach has significant performance benefits as well, (1) by skipping (remote) API invocation overhead and (2) by applying high-level optimization from the compiler.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2016:KLM, author = "Shaosu Liu and Bin Song and Sriharsha Gangam and Lawrence Lo and Khaled Elmeleegy", title = "{Kodiak}: leveraging materialized views for very low-latency analytics over high-dimensional web-scale data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1269--1280", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Turn's online advertising campaigns produce petabytes of data. This data is composed of trillions of events, e.g. impressions, clicks, etc., spanning multiple years. In addition to a timestamp, each event includes hundreds of fields describing the user's attributes, campaign's attributes, attributes of where the ad was served, etc. Advertisers need advanced analytics to monitor their running campaigns' performance, as well as to optimize future campaigns. This involves slicing and dicing the data over tens of dimensions over arbitrary time ranges. Many of these queries need to power the web portal to provide reports and dashboards. For an interactive response time, they have to have tens of milliseconds latency. At Turn's scale of operations, no existing system was able to deliver this performance in a cost effective manner. Kodiak, a distributed analytical data platform for web-scale high-dimensional data, was built to serve this need. It relies on pre-computations to materialize thousands of views to serve these advanced queries. 
These views are partitioned and replicated across Kodiak's storage nodes for scalability and reliability. They are system maintained as new events arrive. At query time, the system auto-selects the most suitable view to serve each query. Kodiak has been used in production for over a year. It hosts 2490 views for over three petabytes of raw data serving over 200K queries daily. It has median and 99\% query latencies of 8 ms and 252 ms respectively. Our experiments show that its query latency is 3 orders of magnitude faster than leading big data platforms on head-to-head comparisons using Turn's query workload. Moreover, Kodiak uses 4 orders of magnitude less resources to run the same workload.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sharma:2016:GRT, author = "Aneesh Sharma and Jerry Jiang and Praveen Bommannavar and Brian Larson and Jimmy Lin", title = "{GraphJet}: real-time content recommendations at {Twitter}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1281--1292", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents GraphJet, a new graph-based system for generating content recommendations at Twitter. As motivation, we trace the evolution of our formulation and approach to the graph recommendation problem, embodied in successive generations of systems. Two trends can be identified: supplementing batch with real-time processing and a broadening of the scope of recommendations from users to content. Both of these trends come together in GraphJet, an in-memory graph processing engine that maintains a real-time bipartite interaction graph between users and tweets.
The storage engine implements a simple API, but one that is sufficiently expressive to support a range of recommendation algorithms based on random walks that we have refined over the years. Similar to Cassovary, a previous graph recommendation engine developed at Twitter, GraphJet assumes that the entire graph can be held in memory on a single server. The system organizes the interaction graph into temporally-partitioned index segments that hold adjacency lists. GraphJet is able to support rapid ingestion of edges while concurrently serving lookup queries through a combination of compact edge encoding and a dynamic memory allocation scheme that exploits power-law characteristics of the graph. Each GraphJet server ingests up to one million graph edges per second, and in steady state, computes up to 500 recommendations per second, which translates into several million edge read operations per second.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ma:2016:DFP, author = "Edward Ma and Vishrut Gupta and Meichun Hsu and Indrajit Roy", title = "\pkg{dmapply}: a functional primitive to express distributed machine learning algorithms in {R}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1293--1304", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/s-plus.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to R's popularity as a data-mining tool, many distributed systems expose an R-based API to users who need to build a distributed application in R. As a result, data scientists have to learn to use different interfaces such as RHadoop, SparkR, Revolution R's ScaleR, and HPE's Distributed R. 
Unfortunately, these interfaces are custom, non-standard, and difficult to learn. Not surprisingly, R applications written in one framework do not work in another, and each backend infrastructure has spent redundant effort in implementing distributed machine learning algorithms. Working with the members of R-core, we have created ddR (Distributed Data structures in R), a unified system that works across different distributed frameworks. In ddR, we introduce a novel programming primitive called dmapply that executes functions on distributed data structures. The dmapply primitive encapsulates different computation patterns: from function and data broadcast to pair-wise communication. We show that dmapply is powerful enough to express algorithms that fit the statistical query model, which includes many popular machine learning algorithms, as well as applications written in MapReduce. We have integrated ddR with many backends, such as R's single-node parallel framework, multi-node SNOW framework, Spark, and HPE Distributed R, with few or no modifications to any of these systems. We have also implemented multiple machine learning algorithms which are not only portable across different distributed systems, but also have performance comparable to the ``native'' implementations on the backends. We believe that ddR will standardize distributed computing in R, just like the SQL interface has standardized how relational data is manipulated.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pedreira:2016:CIM, author = "Pedro Pedreira and Chris Croswhite and Luis Bona", title = "{Cubrick}: indexing millions of records per second for interactive analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1305--1316", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper describes the architecture and design of Cubrick, a distributed multidimensional in-memory DBMS suited for interactive analytics over highly dynamic datasets. Cubrick has a strictly multidimensional data model composed of cubes, dimensions and metrics, supporting sub-second OLAP operations such as slice and dice, roll-up and drill-down over terabytes of data. All data stored in Cubrick is range partitioned by every dimension and stored within containers called bricks in an unordered and sparse fashion, providing high data ingestion rates and indexed access through any combination of dimensions. In this paper, we describe details about Cubrick's internal data structures, distributed model, query execution engine and a few details about the current implementation. Finally, we present results from a thorough experimental evaluation that leveraged datasets and queries collected from a few internal Cubrick deployments at Facebook.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Iosup:2016:LGB, author = "Alexandru Iosup and Tim Hegeman and Wing Lung Ngai and Stijn Heldens and Arnau Prat-P{\'e}rez and Thomas Manhardt and Hassan Chafi and Mihai Capota and Narayanan Sundaram and Michael Anderson and Ilie Gabriel Tanase and Yinglong Xia and Lifeng Nai and Peter Boncz", title = "{LDBC Graphalytics}: a benchmark for large-scale graph analysis on parallel and distributed platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1317--1328", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we introduce LDBC Graphalytics, a new industrial-grade benchmark for graph analysis platforms. It consists of six deterministic algorithms, standard datasets, synthetic dataset generators, and reference output, that enable the objective comparison of graph analysis platforms. Its test harness produces deep metrics that quantify multiple kinds of system scalability, such as horizontal/vertical and weak/strong, and of robustness, such as failures and performance variability. The benchmark comes with open-source software for generating data and monitoring performance. We describe and analyze six implementations of the benchmark (three from the community, three from the industry), providing insights into the strengths and weaknesses of the platforms. Key to our contribution, vendors perform the tuning and benchmarking of their platforms.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lustosa:2016:DSS, author = "Hermano Lustosa and Fabio Porto and Patrick Valduriez and Pablo Blanco", title = "Database system support of simulation data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1329--1340", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Supported by increasingly efficient HPC infra-structure, numerical simulations are rapidly expanding to fields such as oil and gas, medicine and meteorology. As simulations become more precise and cover longer periods of time, they may produce files with terabytes of data that need to be efficiently analyzed. In this paper, we investigate techniques for managing such data using an array DBMS. We take advantage of multidimensional arrays that nicely models the dimensions and variables used in numerical simulations. However, a naive approach to map simulation data files may lead to sparse arrays, impacting query response time, in particular, when the simulation uses irregular meshes to model its physical domain. We propose efficient techniques to map coordinate values in numerical simulations to evenly distributed cells in array chunks with the use of equi-depth histograms and space-filling curves. We implemented our techniques in SciDB and, through experiments over real-world data, compared them with two other approaches: row-store and column-store DBMS. The results indicate that multidimensional arrays and column-stores are much faster than a traditional row-store system for queries over a larger amount of simulation data. 
They also help identifying the scenarios where array DBMSs are most efficient, and those where they are outperformed by column-stores.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jacques-Silva:2016:CRG, author = "Gabriela Jacques-Silva and Fang Zheng and Daniel Debrunner and Kun-Lung Wu and Victor Dogaru and Eric Johnson and Michael Spicer and Ahmet Erdem Sariy{\"u}ce", title = "Consistent regions: guaranteed tuple processing in {IBM Streams}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1341--1352", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Guaranteed tuple processing has become critically important for many streaming applications. This paper describes how we enabled IBM Streams, an enterprise-grade stream processing system, to provide data processing guarantees. Our solution goes from language-level abstractions to a runtime protocol. As a result, with a couple of simple annotations at the source code level, IBM Streams developers can define consistent regions, allowing any subgraph of their streaming application to achieve guaranteed tuple processing. At runtime, a consistent region periodically executes a variation of the Chandy-Lamport snapshot algorithm to establish a consistent global state for that region. The coupling of consistent states with data replay enables guaranteed tuple processing.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Al-Kateb:2016:HRC, author = "Mohammed Al-Kateb and Paul Sinclair and Grace Au and Carrie Ballinger", title = "Hybrid row-column partitioning in {Teradata\reg}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1353--1364", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data partitioning is an indispensable ingredient of database systems due to the performance improvement it can bring to any given mixed workload. Data can be partitioned horizontally or vertically. While some commercial proprietary and open source database systems have one flavor or mixed flavors of these partitioning forms, Teradata Database offers a unique hybrid row-column store solution that seamlessly combines both of these partitioning schemes. The key feature of this hybrid solution is that either row, column, or combined partitions are all stored and handled in the same way internally by the underlying file system storage layer. In this paper, we present the main characteristics and explain the implementation approach of Teradata's row-column store. We also discuss query optimization techniques applicable specifically to partitioned tables. Furthermore, we present a performance study that demonstrates how different partitioning options impact the performance of various queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fernandes:2016:THH, author = "Ricardo Fernandes and Piotr Zaczkowski and Bernd G{\"o}ttler and Conor Ettinoffe and Anis Moussa", title = "{TrafficDB}: {HERE}'s high performance shared-memory data store", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1365--1376", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "HERE's traffic-aware services enable route planning and traffic visualisation on web, mobile and connected car applications. These services process thousands of requests per second and require efficient ways to access the information needed to provide a timely response to end-users. The characteristics of road traffic information and these traffic-aware services require storage solutions with specific performance features. A route planning application utilising traffic congestion information to calculate the optimal route from an origin to a destination might hit a database with millions of queries per second. However, existing storage solutions are not prepared to handle such volumes of concurrent read operations, as well as to provide the desired vertical scalability. This paper presents TrafficDB, a shared-memory data store, designed to provide high rates of read operations, enabling applications to directly access the data from memory. Our evaluation demonstrates that TrafficDB handles millions of read operations and provides near-linear scalability on multi-core machines, where additional processes can be spawned to increase the systems' throughput without a noticeable impact on the latency of querying the data store. 
The paper concludes with a description of how TrafficDB improved the performance of our traffic-aware services running in production.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Scotti:2016:CBH, author = "Alex Scotti and Mark Hannum and Michael Ponomarenko and Dorin Hogea and Akshat Sikarwar and Mohit Khullar and Adi Zaimi and James Leddy and Rivers Zhang and Fabio Angius and Lingzhi Deng", title = "{Comdb2}: {Bloomberg}'s highly available relational database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1377--1388", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Comdb2 is a distributed database system designed for geographical replication and high availability. In contrast with the latest trends in this field, Comdb2 offers full transactional support, a standard relational model, and the expressivity of SQL. Moreover, the system allows for rich stored procedures using a dialect of Lua. Comdb2 implements a serializable system in which reads from any node always return current values. Comdb2 provides transparent High Availability through built-in service discovery and sophisticated retry logic embedded in the standard API. In addition to the relational data model, Comdb2 implements queues for publisher-to-subscriber message delivery. Queues can be combined with table triggers for time-consistent log distribution, providing functionality commonly needed in modern OLTP. In this paper we give an overview of our last twelve years of work. We focus on the design choices that have made Comdb2 the primary database solution within our company, Bloomberg LP (BLP).", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Srinivasan:2016:AAR, author = "V. Srinivasan and Brian Bulkowski and Wei-Ling Chu and Sunil Sayyaparaju and Andrew Gooding and Rajkumar Iyer and Ashish Shinde and Thomas Lopatic", title = "{Aerospike}: architecture of a real-time operational {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1389--1400", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we describe the solutions developed to address key technical challenges encountered while building a distributed database system that can smoothly handle demanding real-time workloads and provide a high level of fault tolerance. Specifically, we describe schemes for the efficient clustering and data partitioning for the automatic scale out of processing across multiple nodes and for optimizing the usage of CPUs, DRAM, SSDs and networks to efficiently scale up performance on one node. The techniques described here were used to develop Aerospike (formerly Citrusleaf), a high performance distributed database system built to handle the needs of today's interactive online services. Most real-time decision systems that use Aerospike require very high scale and need to make decisions within a strict SLA by reading from, and writing to, a database containing billions of data items at a rate of millions of operations per second with sub-millisecond latency. For over five years, Aerospike has been continuously used in over a hundred successful production deployments, as many enterprises have discovered that it can substantially enhance their user experience.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2016:MQO, author = "Jack Chen and Samir Jindel and Robert Walzer and Rajkumar Sen and Nika Jimsheleishvilli and Michael Andrews", title = "The {MemSQL} query optimizer: a modern optimizer for real-time analytics in a distributed database", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1401--1412", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-time analytics on massive datasets has become a very common need in many enterprises. These applications require not only rapid data ingest, but also quick answers to analytical queries operating on the latest data. MemSQL is a distributed SQL database designed to exploit memory-optimized, scale-out architecture to enable real-time transactional and analytical workloads which are fast, highly concurrent, and extremely scalable. Many analytical queries in MemSQL's customer workloads are complex queries involving joins, aggregations, sub-queries, etc. over star and snowflake schemas, often ad-hoc or produced interactively by business intelligence tools. These queries often require latencies of seconds or less, and therefore require the optimizer to not only produce a high quality distributed execution plan, but also produce it fast enough so that optimization time does not become a bottleneck. In this paper, we describe the architecture of the MemSQL Query Optimizer and the design choices and innovations which enable it quickly produce highly efficient execution plans for complex distributed queries. 
We discuss how query rewrite decisions oblivious of distribution cost can lead to poor distributed execution plans, and argue that to choose high-quality plans in a distributed database, the optimizer needs to be distribution-aware in choosing join plans, applying query rewrites, and costing plans. We discuss methods to make join enumeration faster and more effective, such as a rewrite-based approach to exploit bushy joins in queries involving multiple star schemas without sacrificing optimization time. We demonstrate the effectiveness of the MemSQL optimizer over queries from the TPC-H benchmark and a real customer workload.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lakshman:2016:NFS, author = "Sarath Lakshman and Sriram Melkote and John Liang and Ravi Mayuram", title = "{Nitro}: a fast, scalable in-memory storage engine for {NoSQL} global secondary index", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1413--1424", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Nitro, a high-performance in-memory key--value storage engine used in Couchbase 4.5 Global Secondary Indexes. The Nitro storage engine is well suited for the recent hardware trends like large amounts of memory and many CPU cores. The storage engine leverages latch-free data structures and tries to achieve linear scalability for the index read-write operations. The Nitro storage engine offers concurrent readers and writers, lightweight database snapshots, stable scan, backup and recovery operations. 
We integrated Nitro into the Couchbase Global Secondary Indexes (GSI) and observed significant improvement in performance compared to our disk oriented storage engine configured with the same amount of memory for buffer cache. On a 32 core machine, we observed an end-to-end GSI server insertion throughput of 1,650,000 entries/sec and index update throughput of 822,000 entries/sec. A single instance of Nitro data structure running on a 40 core machine achieved a peak insertion throughput of 4 million index entries/sec and entry lookup throughput of 10 million lookups/sec.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Boehm:2016:SDM, author = "Matthias Boehm and Michael W. Dusenberry and Deron Eriksson and Alexandre V. Evfimievski and Faraz Makari Manshadi and Niketan Pansare and Berthold Reinwald and Frederick R. Reiss and Prithviraj Sen and Arvind C. Surve and Shirish Tatikonda", title = "{SystemML}: declarative machine learning on {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1425--1436", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rising need for custom machine learning (ML) algorithms and the growing data sizes that require the exploitation of distributed, data-parallel frameworks such as MapReduce or Spark, pose significant productivity challenges to data scientists.
Apache SystemML addresses these challenges through declarative ML by (1) increasing the productivity of data scientists as they are able to express custom algorithms in a familiar domain-specific language covering linear algebra primitives and statistical functions, and (2) transparently running these ML algorithms on distributed, data-parallel frameworks by applying cost-based compilation techniques to generate efficient, low-level execution plans with in-memory single-node and large-scale distributed operations. This paper describes SystemML on Apache Spark, end to end, including insights into various optimizer and runtime techniques as well as performance characteristics. We also share lessons learned from porting SystemML to Spark and declarative ML in general. Finally, SystemML is open-source, which allows the database community to leverage it as a testbed for further research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mishra:2016:AAD, author = "Aurosish Mishra and Shasank Chavan and Allison Holloway and Tirthankar Lahiri and Zhen Hua Liu and Sunil Chakkappen and Dennis Lui and Vinita Subramanian and Ramesh Kumar and Maria Colgan and Jesse Kamp and Niloy Mukherjee and Vineet Marwah", title = "Accelerating analytics with dynamic in-memory expressions", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1437--1448", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Oracle Database In-Memory (DBIM) accelerates analytic workload performance by orders of magnitude through an in-memory columnar format utilizing techniques such as SIMD vector processing, in-memory storage indexes, and optimized predicate evaluation and aggregation. 
With Oracle Database 12.2, Database In-Memory is further enhanced to accelerate analytic processing through a novel lightweight mechanism known as Dynamic In-Memory Expressions (DIMEs). The DIME mechanism automatically detects frequently occurring expressions in a query workload, and then creates highly optimized, transactionally consistent, in-memory columnar representations of these expression results. At runtime, queries can directly access these DIMEs, thus avoiding costly expression evaluations. Furthermore, all the optimizations introduced in DBIM can apply directly to DIMEs. Since DIMEs are purely in-memory structures, no changes are required to the underlying tables. We show that DIMEs can reduce query elapsed times by several orders of magnitude without the need for costly pre-computed structures such as computed columns or materialized views or cubes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bhadange:2016:GSL, author = "Satyajit Bhadange and Akhil Arora and Arnab Bhattacharya", title = "{GARUDA}: a system for large-scale mining of statistically significant connected subgraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1449--1452", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Unraveling ``interesting'' subgraphs corresponding to disease/crime hotspots or characterizing habitation shift patterns is an important graph mining task. With the availability and growth of large-scale real-world graphs, mining for such subgraphs has become the need of the hour for graph miners as well as non-technical end-users. 
In this demo, we present GARUDA, a system capable of mining large-scale graphs for statistically significant subgraphs in a scalable manner, and provide: (1) a detailed description of the various features and user-friendly GUI of GARUDA; (2) a brief description of the system architecture; and (3) a demonstration scenario for the audience. The demonstration showcases one real graph mining task as well as its ability to scale to large real graphs, portraying speed-ups of upto 8--10 times over the state-of-the-art MSCS algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:VVT, author = "Huan Li and Hua Lu and Xin Chen and Gang Chen and Ke Chen and Lidan Shou", title = "{Vita}: a versatile toolkit for generating indoor mobility data for real-world buildings", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1453--1456", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a generic, user-configurable toolkit for generating different types of indoor mobility data for real-world buildings. Our prototype generates the desired data in a three-layer pipeline. The Infrastructure Layer accepts industry-standard digital building information (DBI) files to generate the host indoor environment, allowing users to configure the generation of a variety of positioning devices, such as Wi-Fi, Bluetooth, RFID, etc. The Moving Object Layer offers the functionality of defining objects or trajectories, with configurable indoor moving patterns, distribution models, and sampling frequencies. 
The Positioning Layer generates synthetic signal strength measurements known as raw RSSI measurements according to the positioning device data and trajectory data generated at relevant layers. It also generates different types of indoor positioning data through the customization of all typical indoor positioning methods on the raw RSSI data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bagan:2016:GFW, author = "Guillaume Bagan and Angela Bonifati and Radu Ciucanu and George H. L. Fletcher and Aur{\'e}lien Lemay and Nicky Advokaat", title = "Generating flexible workloads for graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1457--1460", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph data management tools are nowadays evolving at a great pace. Key drivers of progress in the design and study of data intensive systems are solutions for synthetic generation of data and workloads, for use in empirical studies. Current graph generators, however, provide limited or no support for workload generation or are limited to fixed use-cases. Towards addressing these limitations, we demonstrate gMark, the first domain- and query language-independent framework for synthetic graph and query workload generation. Its novel features are: (i) fine-grained control of graph instance and query workload generation via expressive user-defined schemas; (ii) the support of expressive graph query languages, including recursion among other features; and, (iii) selectivity estimation of the generated queries.
During the demonstration, we will showcase the highly tunable generation of graphs and queries through various user-defined schemas and targeted selectivities, and the variety of supported practical graph query languages. We will also show a performance comparison of four state-of-the-art graph database engines, which helps us understand their current strengths and desirable future extensions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2016:AQP, author = "Xiaofeng Zhou and Yang Chen and Daisy Zhe Wang", title = "{ArchimedesOne}: query processing over probabilistic knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1461--1464", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge bases are becoming increasingly important in structuring and representing information from the web. Meanwhile, web-scale information poses significant scalability and quality challenges to knowledge base systems. To address these challenges, we develop a probabilistic knowledge base system, ArchimedesOne, by scaling up the knowledge expansion and statistical inference algorithms. We design a web interface for users to query and update large knowledge bases. In this paper, we demonstrate the ArchimedesOne system to showcase its efficient query and inference engines. The demonstration serves two purposes: (1) to provide an interface for users to interact with ArchimedesOne through load, search, and update queries; and (2) to validate our approaches of knowledge expansion by applying inference rules in batches using relational operations and query-driven inference by focusing computation on the query facts. 
We compare ArchimedesOne with state-of-the-art approaches using two knowledge bases: NELL-sports with 4.5 million facts and Reverb-Sherlock with 15 million facts.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Milo:2016:RIR, author = "Tova Milo and Slava Novgorodov and Wang-Chiew Tan", title = "{Rudolf}: interactive rule refinement system for fraud detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1465--1468", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Credit card frauds are unauthorized transactions that are made or attempted by a person or an organization that is not authorized by the card holders. In addition to machine learning-based techniques, credit card companies often employ domain experts to manually specify rules that exploit domain knowledge for improving the detection process. Over time, however, as new (fraudulent and legitimate) transactions arrive, these rules need to be updated and refined to capture the evolving (fraud and legitimate) activity patterns. The goal of the RUDOLF system that is demonstrated here is to guide and assist domain experts in this challenging task. RUDOLF automatically determines a best set of candidate adaptations to existing rules to capture all fraudulent transactions and, respectively, omit all legitimate transactions. The proposed modifications can then be further refined by domain experts based on their domain knowledge, and the process can be repeated until the experts are satisfied with the resulting rules. Our experimental results on real-life datasets demonstrate the effectiveness and efficiency of our approach. 
We showcase RUDOLF with two demonstration scenarios: detecting credit card frauds and network attacks. Our demonstration will engage the VLDB audience by allowing them to play the role of a security expert, a credit card fraudster, or a network attacker.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Maccioni:2016:GDB, author = "Antonio Maccioni and Matteo Collina", title = "Graph databases in the browser: using {LevelGraph} to explore {New Delhi}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1469--1472", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The pervasiveness of graphs on the Web is growing; however, the difficulty of managing complex graph structures curbs the development of web-oriented applications that embed network data. The open source project, LevelGraph, aims to overcome the obstacles that web developers face with graph data management. LevelGraph is an easy-to-use graph database layer for web applications. To demonstrate various capabilities of the system, we developed a web-based application that utilizes a graph database of a tourist network in New Delhi. The application allows users to move around the city while LevelGraph executes graph queries on the underlying database. In this demonstration, we show how LevelGraph's features facilitate development and maintenance of web applications that embed graph data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sellam:2016:ZCQ, author = "Thibault Sellam and Martin Kersten", title = "{Ziggy}: characterizing query results for data explorers", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1473--1476", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data exploration has received much attention during the last few years. The aim is to learn interesting new facts from a possibly unfamiliar data set. Typically, explorers operate by trial and error: they write a query, inspect the results and refine their specifications accordingly. In this demo proposal, we present Ziggy, a system to help them understand their query results. Ziggy's aim is to complement an existing exploration system. It assumes that users already have a query in mind, but they do not know what is interesting about it. To assist them, it detects characteristic views, that is, small sets of columns on which the tuples in the results are different from those in the rest of the database. Thanks to these views, our explorers can understand why their selection is unique and make more informed exploration decisions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sellam:2016:BMN, author = "Thibault Sellam and Robin Cijvat and Richard Koopmanschap and Martin Kersten", title = "{Blaeu}: mapping and navigating large tables with cluster analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1477--1480", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Blaeu is an interactive database exploration tool. Its aim is to guide casual users through large data tables, ultimately triggering insights and serendipity. To do so, it relies on a double cluster analysis mechanism. It clusters the data vertically: it detects themes, groups of mutually dependent columns that highlight one aspect of the data. Then it clusters the data horizontally. For each theme, it produces a data map, an interactive visualization of the clusters in the table. The data maps summarize the data. They provide a visual synopsis of the clusters, as well as facilities to inspect their content and annotate them. But they also let the users navigate further. Our explorers can change the active set of columns or drill down into the clusters to refine their selection. Our prototype is fully operational, ready to deliver insights from complex databases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{El-Roby:2016:SQR, author = "Ahmed El-Roby and Khaled Ammar and Ashraf Aboulnaga and Jimmy Lin", title = "{Sapphire}: querying {RDF} data made simple", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1481--1484", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is currently a large amount of publicly accessible structured data available as RDF data sets. For example, the Linked Open Data (LOD) cloud now consists of thousands of RDF data sets with over 30 billion triples, and the number and size of the data sets is continuously growing. Many of the data sets in the LOD cloud provide public SPARQL endpoints to allow issuing queries over them. These end-points enable users to retrieve data using precise and highly expressive SPARQL queries. However, in order to do so, the user must have sufficient knowledge about the data sets that she wishes to query, that is, the structure of data, the vocabulary used within the data set, the exact values of literals, their data types, etc. Thus, while SPARQL is powerful, it is not easy to use. An alternative to SPARQL that does not require as much prior knowledge of the data is some form of keyword search over the structured data. Keyword search queries are easy to use, but inherently ambiguous in describing structured queries. This demonstration introduces Sapphire, a system for querying RDF data that strikes a middle ground between ambiguous keyword search and difficult-to-use SPARQL. Our system does not replace either, but utilizes both where they are most effective. 
Sapphire helps the user construct expressive SPARQL queries that represent her information needs without requiring detailed knowledge about the queried data sets. These queries are then executed over public SPARQL endpoints from the LOD cloud. Sapphire guides the user in the query writing process by showing suggestions of query terms based on the queried data, and by recommending changes to the query based on a predictive user model.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amsterdamer:2016:DDT, author = "Yael Amsterdamer and Tova Milo and Amit Somech and Brit Youngmann", title = "{December}: a declarative tool for crowd member selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1485--1488", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Adequate crowd selection is an important factor in the success of crowdsourcing platforms, increasing the quality and relevance of crowd answers and their performance in different tasks. The optimal crowd selection can greatly vary depending on properties of the crowd and of the task. To this end, we present December, a declarative platform with novel capabilities for flexible crowd selection. December supports the personalized selection of crowd members via a dedicated query language Member-QL. This language enables specifying and combining common crowd selection criteria such as properties of a crowd member's profile and history, similarity between profiles in specific aspects and relevance of the member to a given task. This holistic, customizable approach differs from previous work that has mostly focused on dedicated algorithms for crowd selection in specific settings. 
To allow efficient query execution, we implement novel algorithms in December based on our generic, semantically-aware definitions of crowd member similarity and expertise. We demonstrate the effectiveness of December and Member-QL by using the VLDB community as crowd members and allowing conference participants to choose from among these members for different purposes and in different contexts.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2016:DVV, author = "Xi He and Nisarg Raval and Ashwin Machanavajjhala", title = "A demonstration of {VisDPT}: visual exploration of differentially private trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1489--1492", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The release of detailed taxi trips has motivated numerous useful studies, but has also triggered multiple privacy attacks on individuals' trips. Despite these attacks, no tools are available for systematically analyzing the privacy risk of released trajectory data. While, recent studies have proposed mechanisms to publish synthetic mobility data with provable privacy guarantees, the questions on --- (1) how to explain the theoretical privacy guarantee to non-privacy experts; and (2) how well private data preserves the properties of ground truth, remain unclear. To address these issues, we propose a system --- VisDPT that provides rich visualization of sensitive information in trajectory databases and helps data curators understand the impact on utility due to privacy preserving mechanisms. 
We believe VisDPT will enable data curators to take informed decisions while publishing sanitized data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Scheuer:2016:JSA, author = "Tobias Scheuer and Norman May and Alexander B{\"o}hm and Daniel Scheibli", title = "{JexLog}: a sonar for the abyss", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1493--1496", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's hardware architectures provide an ever-increasing number of CPU cores that can be used for running concurrent operations. A big challenge is to ensure that these operations are properly synchronized and make efficient use of the available resources. Fellow database researchers have appropriately described this problem as ``staring into the abyss'' of complexity [12], where reasoning about the interplay of jobs on a thousand cores becomes extremely challenging. In this demonstration, we show how a new tool, JexLog, can help to visually analyze concurrent jobs in system software and how it is used to optimize for modern hardware.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ikeda:2016:CCC, author = "Kosetsu Ikeda and Atsuyuki Morishima and Habibur Rahman and Senjuti Basu Roy and Saravanan Thirumuruganathan and Sihem Amer-Yahia and Gautam Das", title = "Collaborative crowdsourcing with {Crowd4U}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1497--1500", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Collaborative crowdsourcing is an emerging paradigm where a set of workers, often with diverse and complementary skills, form groups and work together to complete complex tasks. While crowdsourcing has been used successfully in many applications, collaboration is essential for achieving a high quality outcome for a number of emerging applications such as text translation, citizen journalism and surveillance tasks. However, no crowdsourcing platform today enables the end-to-end deployment of collaborative tasks. We demonstrate Crowd4U, a volunteer-based system that enables the deployment of diverse crowdsourcing tasks with complex data-flows, in a declarative manner. In addition to treating workers and tasks as rich entities, Crowd4U also provides an easy-to-use form-based task UI. Crowd4U implements worker-to-task assignment algorithms that are appropriate for each kind of task. Once workers are assigned to tasks, appropriate worker collaboration schemes are enforced in order to enable effective result coordination.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2016:YWQ, author = "Lei Chen and Jianliang Xu and Christian S. 
Jensen and Yafei Li", title = "{YASK}: a why-not question answering engine for spatial keyword query services", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1501--1504", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the proliferation of the mobile use of the web, spatial keyword query (SKQ) services are gaining in importance. However, state-of-the-art SKQ systems do not provide systematic functionality that allows users to ask why some known object is unexpectedly missing from a query result and do not provide an explanation for such missing objects. In this demonstration, we present a system called YASK, a whY-not question Answering engine for Spatial Keyword query services, that is capable of answering why-not questions posed in response to answers to spatial keyword top-$k$ queries. Two explanation and query refinement models, namely preference adjustment and keyword adaption, are implemented in YASK. The system provides users not only with the reasons why desired objects are missing from query results, but provides also relevant refined queries that revive the expected but missing objects. This demonstration gives attendees hands-on experience with YASK through a map-based GUI interface in which attendees can issue spatial keyword queries, pose why-not questions, and visualize the results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yi:2016:AVQ, author = "Peipei Yi and Byron Choi and Sourav S. 
Bhowmick and Jianliang Xu", title = "{AutoG}: a visual query autocompletion framework for graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1505--1508", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Composing queries is evidently a tedious task. This is particularly true of graph queries as they are typically complex and prone to errors, compounded by the fact that graph schemas can be missing or too loose to be helpful for query formulation. Despite the great success of query formulation aids, in particular, automatic query completion, graph query autocompletion has received much less research attention. In this demonstration, we present a novel interactive visual subgraph query autocompletion framework called AutoG which alleviates the potentially painstaking task of graph query formulation. Specifically, given a large collection of small or medium-sized graphs and a visual query fragment $q$ formulated by a user, AutoG returns top-$k$ query suggestions $Q'$ as output at interactive time. Users may choose a query from $Q'$ and iteratively apply AutoG to compose their queries. We demonstrate various features of AutoG and its superior ability to generate high quality suggestions to aid visual subgraph query formulation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Miao:2016:SPR, author = "Xiaoye Miao and Yunjun Gao and Gang Chen and Huiyong Cui and Chong Guo and Weida Pan", title = "{Si$^2$p}: a restaurant recommendation system using preference queries over incomplete information", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1509--1512", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The incomplete data is universal in many real-life applications due to data integration, the limitation of devices, etc. In this demonstration, we present Si$^2$p, a restaurant recommendation System with Preference queries on Incomplete Information. Si$^2$p is capable of friendly recommending desirable restaurants based on preference queries that take the incomplete ratings information into consideration. It adopts the browser-server model, and incorporates three functionality modules including friendly and convenient query submission, flexible and useful result explanation, timely and incremental dataset interaction. Si$^2$p provides the server side based on an extended PostgreSQL database that integrates two types of preference queries, namely, skyline and top-$k$ dominating queries over incomplete data. It also offers the browser-based interface for the users to interact with the system. Using a real restaurant dataset from TripAdvisor, we demonstrate Si$^2$p can recommend and explore the restaurants in a friendly way.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bonaque:2016:MIQ, author = "R. Bonaque and T. D. Cao and B. Cautis and F. Goasdou{\'e} and J. 
Letelier and I. Manolescu and O. Mendoza and S. Ribeiro and X. Tannier", title = "Mixed-instance querying: a lightweight integration architecture for data journalism", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1513--1516", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As the world's affairs get increasingly more digital, timely production and consumption of news require to efficiently and quickly exploit heterogeneous data sources. Discussions with journalists revealed that content management tools currently at their disposal fall very short of expectations. We demonstrate Tatooine, a lightweight data integration prototype, which allows to quickly set up integration queries across (very) heterogeneous data sources, capitalizing on the many data links (joins) available in this application domain. Our demonstration is based on scenarios we study in collaboration with Le Monde, France's major newspaper.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Butterstein:2016:PPS, author = "Dennis Butterstein and Torsten Grust", title = "Precision performance surgery for {PostgreSQL}: {LLVM}-based expression compilation, just in time", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1517--1520", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate how the compilation of SQL expressions into machine code leads to significant query runtime improvements in PostgreSQL 9. 
Our primary goal is to connect recent research in query code generation with one of the most widely deployed database engines. The approach calls on LLVM to translate arithmetic and filter expressions into native x86 instructions just before SQL query execution begins. We deliberately follow a non-invasive design that does not turn PostgreSQL on its head: interpreted and compiled expression evaluation coexist and both are used to execute the same query. We will bring an enhanced version of PostgreSQL that exhibits notable runtime savings and provides visual insight into exactly where and how execution plans can benefit from SQL expression compilation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yahya:2016:EQE, author = "Mohamed Yahya and Klaus Berberich and Maya Ramanath and Gerhard Weikum", title = "Exploratory querying of extended knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1521--1524", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge graphs (KGs) are important assets for search, analytics, and recommendations. However, querying a KG to explore entities and discover facts is difficult and tedious, even for users with skills in SPARQL. First, users are not familiar with the structure and labels of entities, classes and relations. Second, KGs are bound to be incomplete, as they capture only major facts about entities and their relationships and miss out on many of the more subtle aspects. We demonstrate TriniT, a system that facilitates exploratory querying of large KGs, by addressing these issues of ``vocabulary'' mismatch and KG incompleteness. 
TriniT supports query relaxation rules that are invoked to allow for relevant answers which are not found otherwise. The incompleteness issue is addressed by extending a KG with additional text-style token triples obtained by running Open IE on Web and text sources. The query language, relaxation methods, and answer ranking are extended appropriately. The demo shows automatic query relaxation and has support for interactively adding user-customized relaxations. In both situations, the demo provides answer explanations and offers additional query suggestions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Panev:2016:EDR, author = "Kiril Panev and Sebastian Michel and Evica Milchevski and Koninika Pal", title = "Exploring databases via reverse engineering ranking queries with {PALEO}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1525--1528", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A novel approach to explore databases using ranked lists is demonstrated. Working with ranked lists, capturing the relative performance of entities, is a very intuitive and widely applicable concept. Users can post lists of entities for which explanatory SQL queries and full result lists are returned. By refining the input, the results, or the queries, users can interactively explore the database content. The demonstrated system is centered around our PALEO framework for reverse engineering OLAP-style database queries and novel work on mining interesting categorical attributes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bespinyowong:2016:EER, author = "Ramon Bespinyowong and Wei Chen and H. V. Jagadish and Yuxin Ma", title = "{ExRank}: an exploratory ranking interface", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1529--1532", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Even with simple everyday tasks like online shopping or choosing a restaurant, users are easily overwhelmed with the large number of choices available today, each with a large number of inter-related attributes. We present ExRank, an interactive interface for exploring data that helps users understand the relationship between attribute values and find interesting items in the dataset. Based on a kNN graph and a PageRank algorithm, ExRank suggests which attributes the user should look at, and how expressed choices in particular attributes affect the distribution of values in other attributes for candidate objects. It solves the problem of empty result by showing similar items and when there are too many results, it ranks the data for the user. This demo consists of (1) the description of the software architecture and the user interface (2) the logic and reason behind our solution and (3) a list of demonstration scenarios for showing to the audience.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Diaz:2016:SQR, author = "Gonzalo Diaz and Marcelo Arenas and Michael Benedikt", title = "{SPARQLByE}: querying {RDF} data by example", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1533--1536", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Semantic Web technologies such as RDF and its query language, SPARQL, offer the possibility of opening up the use of public datasets to a great variety of ordinary users. But a key obstacle to the use of open data is the unfamiliarity of users with the structure of data or with SPARQL. To deal with these issues, we introduce a system for querying RDF data by example. At its core is a technique for reverse-engineering SPARQL queries by example. We demonstrate how reverse engineering along with other techniques, such as query relaxation, enables our system, SPARQLByE, to guide users who are unfamiliar with both the dataset and with SPARQL to the desired query and result set.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deutch:2016:NNL, author = "Daniel Deutch and Nave Frost and Amir Gilad", title = "{NLProv}: natural language provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1537--1540", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose to present NLProv: an end-to-end Natural Language (NL) interface for database queries. 
Previous work has focused on interfaces for specifying NL questions, which are then compiled into queries in a formal language (e.g. SQL). We build upon this work, but focus on presenting a detailed form of the answers in Natural Language. The answers that we present are importantly based on the provenance of tuples in the query result, detailing not only which are the results but also their explanations. We develop a novel method for transforming provenance information to NL, by leveraging the original NL question structure. Furthermore, since provenance information is typically large, we present two solutions for its effective presentation as NL text: one that is based on provenance factorization with novel desiderata relevant to the NL case, and one that is based on summarization.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandra:2016:PMA, author = "Bikash Chandra and Mathew Joseph and Bharath Radhakrishnan and Shreevidhya Acharya and S. Sudarshan", title = "Partial marking for automated grading of {SQL} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1541--1544", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The XData system, currently being developed at IIT Bombay, provides an automated and interactive platform for grading student SQL queries, as well as for learning SQL. Prior work on the XData system focused on generating query specific test cases to catch common errors in queries. These test cases are used to check whether the student queries are correct or not. 
For grading student assignments, it is usually not sufficient to just check if a query is correct: if the query is incorrect, partial marks may need to be given, depending on how close the query is to being correct. In this paper, we extend the XData system by adding features that enable awarding of partial marks to incorrect student queries. Our system is able to go beyond numerous syntactic features when comparing a student query with a correct query. These features of our grading system allow the grading of SQL queries to be fully automated, and scalable to even large class sizes such as those of MOOCs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhao:2016:TPM, author = "Kaiqi Zhao and Yiding Liu and Quan Yuan and Lisi Chen and Zhida Chen and Gao Cong", title = "Towards personalized maps: mining user preferences from geo-textual data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1545--1548", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Rich geo-textual data is available online and the data keeps increasing at a high speed. We propose two user behavior models to learn several types of user preferences from geo-textual data, and a prototype system on top of the user preference models for mining and search geo-textual data (called PreMiner) to support personalized maps. 
Different from existing recommender systems and data analysis systems, PreMiner highly personalizes user experience on maps and supports several applications, including user mobility \& interests mining, opinion mining in regions, user recommendation, point-of-interest recommendation, and querying and subscribing on geo-textual data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Feng:2016:SRS, author = "Kaiyu Feng and Kaiqi Zhao and Yiding Liu and Gao Cong", title = "A system for region search and exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1549--1552", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increasing popularity of mobile devices and location based services, massive amount of geo-textual data (e.g., geo-tagged tweets) is being generated everyday. Compared with traditional spatial data, the textual dimension of geo-textual data greatly enriches the data. Meanwhile, the spatial dimension of geo-textual data also adds a semantically rich new aspect to textual data. The large volume, together with its rich semantics, calls for the need for data exploration. First, it has many applications to retrieve a region for exploration that satisfies user-specified conditions (e.g., the size and shape of the region) while maximizing some other conditions (e.g., the relevance to the query keywords of the objects in the region). Second, it is useful to mine and explore the topics of the geo-textual data within a (specified or retrieved) region and perhaps a timespan. 
This demonstration proposal presents the main ideas of our system, the Region Search and Exploration System (RISE), for efficiently supporting region search and exploration, and our demonstration plan.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Vitorovic:2016:SSR, author = "Aleksandar Vitorovic and Mohammed Elseidy and Khayyam Guliyev and Khue Vu Minh and Daniel Espino and Mohammad Dashti and Yannis Klonatos and Christoph Koch", title = "{Squall}: scalable real-time analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1553--1556", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Squall is a scalable online query engine that runs complex analytics in a cluster using skew-resilient, adaptive operators. Squall builds on state-of-the-art partitioning schemes and local algorithms, including some of our own. This paper presents the overview of Squall, including some novel join operators. The paper also presents lessons learned over the five years of working on this system, and outlines the plan for the proposed system demonstration.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khurana:2016:GBE, author = "Udayan Khurana and Srinivasan Parthasarathy and Deepak Turaga", title = "Graph-based exploration of non-graph datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1557--1560", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs or networks provide a powerful abstraction to view and analyze relationships among different entities present in a dataset. However, much of the data of interest to analysts and data scientists resides in non-graph forms such as relational databases, JSON, XML, CSV and text. The effort and skill required in identifying and extracting the relevant graph representation from data is often prohibitive and limits a wider adoption of graph-based analysis of non-graph data. In this paper, we demonstrate our system called GraphViewer, for accelerated graph-based exploration and analysis. It automatically discovers relevant graphs implicit within a given non-graph dataset using a set of novel rule-based and data-driven techniques, and optimizes their extraction and storage. It computes several node and graph level metrics and detects anomalous entities in data. Finally, it summarizes the results to support interpretation by a human analyst. While the system automates the computationally intensive aspects of the process, it is engineered to leverage human domain expertise and instincts to fine tune the data exploration process.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2016:RDF, author = "Minjian Liu and Qing Wang", title = "{Rogas}: a declarative framework for network analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1561--1564", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Network analytics has become increasingly popular in recent years. Various graph systems have been developed for analysing networks, while network data is still largely stored and managed in relational database systems in the first place. As two separate systems are often used to manage and analyse network data, it not only increases the difficulty for users to learn and maintain these different systems simultaneously, but also impedes performing more sophisticated analysis on relational and topological properties of network data. Aiming to tackle these issues, we present Rogas in this paper, which is a declarative framework that allows the user to formulate analysis queries naturally without thinking about the tedious implementation details of graph algorithms and query processing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tang:2016:LDM, author = "Mingjie Tang and Yongyang Yu and Qutaibah M. Malluhi and Mourad Ouzzani and Walid G. 
Aref", title = "{LocationSpark}: a distributed in-memory data management system for big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1565--1568", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present LocationSpark, a spatial data processing system built on top of Apache Spark, a widely used distributed data processing system. LocationSpark offers a rich set of spatial query operators, e.g., range search, $k$NN, spatio-textual operation, spatial-join, and $k$NN-join. To achieve high performance, LocationSpark employs various spatial indexes for in-memory data, and guarantees that immutable spatial indexes have low overhead with fault tolerance. In addition, we build two new layers over Spark, namely a query scheduler and a query executor. The query scheduler is responsible for mitigating skew in spatial queries, while the query executor selects the best plan based on the indexes and the nature of the spatial queries. Furthermore, to avoid unnecessary network communication overhead when processing overlapped spatial data, we embed an efficient spatial Bloom filter into LocationSpark's indexes. Finally, LocationSpark tracks frequently accessed spatial data, and dynamically flushes less frequently accessed data into disk. We evaluate our system on real workloads and demonstrate that it achieves an order of magnitude performance gain over a baseline framework.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shanbhag:2016:ASC, author = "Anil Shanbhag and Alekh Jindal and Yi Lu and Samuel Madden", title = "{Amoeba}: a shape changing storage system for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1569--1572", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data partitioning significantly improves the query performance in distributed database systems. A large number of techniques have been proposed to efficiently partition a dataset for a given query workload. However, many modern analytic applications involve ad-hoc or exploratory analysis where users do not have a representative query workload upfront. Furthermore, workloads change over time as businesses evolve or as analysts gain better understanding of their data. Static workload-based data partitioning techniques are therefore not suitable for such settings. In this paper, we describe the demonstration of Amoeba, a distributed storage system which uses adaptive multi-attribute data partitioning to efficiently support ad-hoc as well as recurring queries. Amoeba applies a robust partitioning algorithm such that ad-hoc queries on all attributes have similar performance gains. Thereafter, Amoeba adaptively repartitions the data based on the observed query sequence, i.e., the system improves over time. All along Amoeba offers both adaptivity (i.e., adjustments according to workload changes) as well as robustness (i.e., avoiding performance spikes due to workload changes). We propose to demonstrate Amoeba on scenarios from an internet-of-things startup that tracks user driving patterns. 
We invite the audience to interactively fire fast ad-hoc queries, observe multi-dimensional adaptivity, and play with a robust/reactive knob in Amoeba. The web front end displays the layout changes, runtime costs, and compares it to Spark with both default and workload-aware partitioning.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Olteanu:2016:FRM, author = "Dan Olteanu and Maximilian Schleich", title = "{F}: regression models over factorized views", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1573--1576", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate F, a system for building regression models over database views. At its core lies the observation that the computation and representation of materialized views, and in particular of joins, entail non-trivial redundancy that is not necessary for the efficient computation of aggregates used for building regression models. F avoids this redundancy by factorizing data and computation and can outperform the state-of-the-art systems MADlib, R, and Python StatsModels by orders of magnitude on real-world datasets. We illustrate how to incrementally build regression models over factorized views using both an in-memory implementation of F and its SQL encoding. We also showcase the effective use of F for model selection: F decouples the data-dependent computation step from the data-independent convergence of model parameters and only performs once the former to explore the entire model space.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rodriguez:2016:SMP, author = "Miguel Rodr{\'\i}guez and Sean Goldberg and Daisy Zhe Wang", title = "{SigmaKB}: multiple probabilistic knowledge base fusion", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1577--1580", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The interest in integrating web-scale knowledge bases (KBs) has intensified in the last several years. Research has focused on knowledge base completion between two KBs with complementary information, lacking any notion of uncertainty or method of handling conflicting information. We present SigmaKB, a knowledge base system that utilizes Consensus Maximization Fusion and user feedback to integrate and improve the query results of a total of 71 KBs. This paper presents the architecture and demonstration details.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Konda:2016:MTBb, author = "Pradap Konda and Sanjib Das and Paul Suganthan G. C. and AnHai Doan and Adel Ardalan and Jeffrey R. 
Ballard and Han Li and Fatemah Panahi and Haojun Zhang and Jeff Naughton and Shishir Prasad and Ganesh Krishnan and Rohit Deep and Vijay Raghavendra", title = "{Magellan}: toward building entity matching management systems over data science stacks", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1581--1584", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity matching (EM) has been a long-standing challenge in data management. Most current EM works, however, focus only on developing matching algorithms. We argue that far more efforts should be devoted to building EM systems. We discuss the limitations of current EM systems, then present Magellan, a new kind of EM system that addresses these limitations. Magellan is novel in four important aspects. (1) It provides a how-to guide that tells users what to do in each EM scenario, step by step. (2) It provides tools to help users do these steps; the tools seek to cover the entire EM pipeline, not just matching and blocking as current EM systems do. (3) Tools are built on top of the data science stacks in Python, allowing Magellan to borrow a rich set of capabilities in data cleaning, IE, visualization, learning, etc. (4) Magellan provides a powerful scripting environment to facilitate interactive experimentation and allow users to quickly write code to ``patch'' the system. We have extensively evaluated Magellan with 44 students and users at various organizations. In this paper we propose demonstration scenarios that show the promise of the Magellan approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alkowaileet:2016:LSC, author = "Wail Y.
Alkowaileet and Sattam Alsubaiee and Michael J. Carey and Till Westmann and Yingyi Bu", title = "Large-scale complex analytics on semi-structured datasets using {AsterixDB} and {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1585--1588", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large quantities of raw data are being generated by many different sources in different formats. Private and public sectors alike acclaim the valuable information and insights that can be mined from such data to better understand the dynamics of everyday life, such as traffic, worldwide logistics, and social behavior. For this reason, storing, managing, and analyzing ``Big Data'' at scale is getting a tremendous amount of attention, both in academia and industry. In this paper, we demonstrate the power of a parallel connection that we have built between Apache Spark and Apache AsterixDB (Incubating) to enable complex analytics such as machine learning and graph analysis on data drawn from large semi-structured data collections. The integration of these two systems allows researchers and data scientists to leverage AsterixDB capabilities, including fast ingestion and indexing of semi-structured data and efficient answering of geo-spatial and fuzzy text queries. Complex data analytics can then be performed on the resulting AsterixDB query output in order to obtain additional insights by leveraging the power of Spark's machine learning and graph libraries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Picado:2016:SIS, author = "Jose Picado and Parisa Ataei and Arash Termehchy and Alan Fern", title = "Schema independent and scalable relational learning by {Castor}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1589--1592", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Learning novel relations from relational databases is an important problem with many applications in database systems and machine learning. Relational learning algorithms leverage the properties of the database schema to find the definition of the target relation in terms of the existing relations in the database. However, the same data set may be represented under different schemas for various reasons, such as efficiency and data quality. Unfortunately, current relational learning algorithms tend to vary quite substantially over the choice of schema, which complicates their off-the-shelf application. We demonstrate Castor, a relational learning system that efficiently learns the same definitions over common schema variations. The results of Castor are more accurate than well-known learning systems over large data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kannapalli:2016:AWA, author = "Rajeshkumar Kannapalli and Azade Nazi and Mahashweta Das and Gautam Das", title = "{AD-WIRE}: add-on for {Web} item reviewing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1593--1596", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the past few decades as purchasing options moved online, the widespread use and popularity of online review sites has simultaneously increased. In spite of the fact that a huge extent of buying choices today are driven by numeric scores (e.g., rating a product), detailed reviews play an important role for activities like purchasing an expensive DSLR camera. Since writing a detailed review for an item is usually time-consuming, the number of reviews available in the Web is far from many. In this paper, we build a system AD-WIRE that given a user and an item, our system identifies the top-$k$ meaningful tags to help her review the item easily. AD-WIRE allows a user to compose her review by quickly selecting from among the set of returned tags or writes her own review. AD-WIRE also visualizes the dependency of the tags to different aspects of an item so a user can make an informed decision quickly. The system can be used for different type of the products. The current demonstration is built to explore review writing process for the mobile phones.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chaoji:2016:MLR, author = "Vineet Chaoji and Rajeev Rastogi and Gourav Roy", title = "Machine learning in the real world", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1597--1600", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine Learning (ML) has become a mature technology that is being applied to a wide range of business problems such as web search, online advertising, product recommendations, object recognition, and so on. As a result, it has become imperative for researchers and practitioners to have a fundamental understanding of ML concepts and practical knowledge of end-to-end modeling. This tutorial takes a hands-on approach to introducing the audience to machine learning. The first part of the tutorial gives a broad overview and discusses some of the key concepts within machine learning. The second part of the tutorial takes the audience through the end-to-end modeling pipeline for a real-world income prediction problem.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bohm:2016:OAD, author = "Alexander B{\"o}hm and Jens Dittrich and Niloy Mukherjee and Ippokratis Pandis and Rajkumar Sen", title = "Operational analytics data management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1601--1604", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Prior to mid-2000s, the space of data analytics was mainly confined within the area of decision support systems. It was a long era of isolated enterprise data warehouses curating information from live data sources and of business intelligence software used to query such information. Most data sets were small enough in volume and static enough in velocity to be segregated in warehouses for analysis. Data analysis was not ad-hoc; it required pre-requisite knowledge of underlying data access patterns for the creation of specialized access methods (e.g. covering indexes, materialized views) in order to efficiently execute a set of few focused queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2016:QDC, author = "Xu Chu and Ihab F.
Ilyas", title = "Qualitative data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1605--1608", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data quality is one of the most important problems in data management, since dirty data often leads to inaccurate data analytics results and wrong business decisions. Data cleaning exercises often consist of two phases: error detection and error repairing. Error detection techniques can either be quantitative or qualitative; and error repairing is performed by applying data transformation scripts or by involving human experts, and sometimes both. In this tutorial, we discuss the main facets and directions in designing qualitative data cleaning techniques. We present a taxonomy of current qualitative error detection techniques, as well as a taxonomy of current data repairing techniques. We will also discuss proposals for tackling the challenges for cleaning ``big data'' in terms of scale and distribution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Larson:2016:MMM, author = "Per-{\AA}ke Larson and Justin Levandoski", title = "Modern main-memory database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1609--1610", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This tutorial provides an overview of recent developments in main-memory database systems.
With growing memory sizes and memory prices dropping by a factor of 10 every 5 years, data having a ``primary home'' in memory is now a reality. Main-memory databases eschew many of the traditional architectural tenets of relational database systems that optimized for disk-resident data. Innovative approaches to fundamental issues such as concurrency control and query processing are required to unleash the full performance potential of main-memory databases. The tutorial is focused around design issues and architectural choices that must be made when building a high performance database system optimized for main-memory: data storage and indexing, concurrency control, durability and recovery techniques, query processing and compilation, support for high availability, and ability to support hybrid transactional and analytics workloads. This will be illustrated by example solutions drawn from four state-of-the-art systems: H-Store/VoltDB, Hekaton, HyPeR, and SAP HANA. The tutorial will also cover current and future research trends.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Machanavajjhala:2016:DPW, author = "Ashwin Machanavajjhala and Xi He and Michael Hay", title = "Differential privacy in the wild: a tutorial on current practices \& open challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1611--1614", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy has emerged as an important standard for privacy preserving computation over databases containing sensitive information about individuals. 
Research on differential privacy spanning a number of research areas, including theory, security, database, networks, machine learning, and statistics, over the last decade has resulted in a variety of privacy preserving algorithms for a number of analysis tasks. Despite maturing research efforts, the adoption of differential privacy by practitioners in industry, academia, or government agencies has so far been rare. Hence, in this tutorial, we will first describe the foundations of differentially private algorithm design that cover the state of the art in private computation on tabular data. In the second half of the tutorial we will highlight real world applications on complex data types, and identify research challenges in applying differential privacy to real world applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amer-Yahia:2016:HFC, author = "Sihem Amer-Yahia and Senjuti Basu Roy", title = "Human factors in crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1615--1618", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today, crowdsourcing is used to ``taskify'' any job ranging from simple receipt transcription to collaborative editing, fan-subbing, citizen science, and citizen journalism. The crowd is typically volatile, its arrival and departure asynchronous, and its levels of attention and accuracy diverse. Tasks vary in complexity and may necessitate the participation of workers with varying degrees of expertise. Sometimes, workers need to collaborate explicitly and build on each other's contributions to complete a single task. 
For example, in disaster reporting, CrowdMap allows geographically close people with diverse and complementary skills, to work together to report details about the course of a typhoon or the aftermath of an earthquake. This uber-ization of human labor requires the understanding of workers' motivation in completing a task, their ability to work together in collaborative tasks, as well as, helping workers find relevant tasks. For over 40 years, organization studies have thoroughly examined human factors that affect workers in physical workplaces. More recently, computer scientists have developed algorithms that verify and leverage those findings in a virtual marketplace, in this case, a crowdsourcing platform. The goal of this tutorial is to review those two areas and discuss how their combination may improve workers' experience, task throughput and outcome quality for both micro-tasks and collaborative tasks. We will start with a coverage of motivation theory, team formation, and learning worker profiles. We will then address open research questions that result from this review.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Stoica:2016:TCB, author = "Ion Stoica", title = "Trends and challenges in big data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1619--1619", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Almost six years ago we started the Spark project at UC Berkeley. Spark is a cluster computing engine that is optimized for in-memory processing, and unifies support for a variety of workloads, including batch, interactive querying, streaming, and iterative computations.
Spark is now the most active big data project in the open source community, and is already being used by over one thousand organizations. One of the reasons behind Spark's success has been our early bet on the continuous increase in the memory capacity and the feasibility to fit many realistic workloads in the aggregate memory of typical production clusters. Today, we are witnessing new trends, such as Moore's law slowing down, and the emergence of a variety of computation and storage technologies, such as GPUs, FPGAs, and 3D Xpoint. In this talk, I'll discuss some of the lessons we learned in developing Spark as a unified computation platform, and the implications of today's hardware and software trends on the development of the next generation of big data processing systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rajaraman:2016:DDD, author = "Anand Rajaraman", title = "Data-driven disruption: the view from {Silicon Valley}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1620--1620", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We live in an era where software is transforming industries, the sciences, and society as a whole. This exciting phenomenon has been described by the phrase ``software is eating the world.'' It is becoming increasingly apparent that data is the fuel powering software's conquests. Data is the new disruptor. It's hard to believe that the first decade of the Big Data era is already behind us. 
Silicon Valley has been at the forefront of developing and applying data-driven approaches to create disruption at many levels: infrastructure (e.g., Hadoop and Spark), capabilities (e.g., image recognition and machine translation), and killer apps (e.g., self-driving cars and messaging bots). In this talk, we first look back on the past decade and share learnings from the frontlines of data-driven disruption. Looking ahead, we then describe challenges and opportunities for the next decade. Since this has also been a personal journey, we will use examples drawn from personal experience to illustrate each point.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dong:2016:LNV, author = "Xin Luna Dong", title = "Leave no valuable data behind: the crazy ideas and the business", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1621--1621", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the mission ``leave no valuable data behind'', we developed techniques for knowledge fusion to guarantee the correctness of the knowledge. This talk starts with describing a few crazy ideas we have tested. The first, known as ``Knowledge Vault'', used 15 extractors to automatically extract knowledge from 1B+ Webpages, obtaining 3B+ distinct (subject, predicate, object) knowledge triples and predicting well-calibrated probabilities for extracted triples. The second, known as ``Knowledge-Based Trust'', estimated the trustworthiness of 119M webpages and 5.6M websites based on the correctness of their factual information. 
We then present how we bring the ideas to business in filling the gap between the knowledge at Google Knowledge Graph and the knowledge in the world.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mokbel:2016:LDM, author = "Mohamed Mokbel and Chi-Yin Chow and Walid Aref", title = "Location data management: a tale of two systems and the ``next destination''!", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "13", pages = "1622--1622", month = sep, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:19:51 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In early 2000, we had the vision of ubiquitous location services, where each object is aware of its location, and continuously sends its location to a designated database server. This flood of location data opened the door for a myriad of location-based services that were considered visionary at that time, yet today they are a reality and have become ubiquitous. To realize our early vision, we identified two main challenges that needed to be addressed, namely, scalability and privacy. We have addressed these challenges through two main systems, PLACE and Casper. PLACE, developed at Purdue University from 2000 to 2005, set up the environment for built-in database support of scalable and continuous location-based services. The Casper system, developed at University of Minnesota from 2005 to 2010, was built inside the PLACE server allowing it to provide its high quality scalable service, while maintaining the privacy of its users' locations. 
This talk will take you through a time journey of location services from 2000 until today, and beyond, highlighting the development efforts of the PLACE and Casper systems, along with their impact on current and future research initiatives in both academia and industry.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandramouli:2016:QET, author = "Badrish Chandramouli and Raul Castro Fernandez and Jonathan Goldstein and Ahmed Eldawy and Abdul Quamar", title = "{Quill}: efficient, transferable, and rich analytics at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1623--1634", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper introduces Quill (stands for a quadrillion tuples per day), a library and distributed platform for relational and temporal analytics over large datasets in the cloud. Quill exposes a new abstraction for parallel datasets and computation, called ShardedStreamable. This abstraction provides the ability to express efficient distributed physical query plans that are transferable, i.e., movable from offline to real-time and vice versa. ShardedStreamable decouples incremental query logic specification, a small but rich set of data movement operations, and keying; this allows Quill to express a broad space of plans with complex querying functionality, while leveraging existing temporal libraries such as Trill. Quill's layered architecture provides a careful separation of responsibilities with independently useful components, while retaining high performance. 
We built Quill for the cloud, with a master-less design where a language-integrated client library directly communicates and coordinates with cloud workers using off-the-shelf distributed cloud components such as queues. Experiments on up to 400 cloud machines, and on datasets up to 1TB, find Quill to incur low overheads and outperform SparkSQL by up to orders-of-magnitude for temporal and 6$ \times $ for relational queries, while supporting a rich space of transferable, programmable, and expressive distributed physical query plans.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Walenz:2016:PAD, author = "Brett Walenz and Jun Yang", title = "Perturbation analysis of database queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1635--1646", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a system, Perada, for parallel perturbation analysis of database queries. Perturbation analysis considers the results of a query evaluated with (a typically large number of) different parameter settings, to help discover leads and evaluate claims from data. Perada simplifies the development of general, ad hoc perturbation analysis by providing a flexible API to support a variety of optimizations such as grouping, memoization, and pruning; by automatically optimizing performance through run-time observation, learning, and adaptation; and by hiding the complexity of concurrency and failures from its developers. We demonstrate Perada's efficacy and efficiency with real workloads applying perturbation analysis to computational journalism.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:HBG, author = "Jing Li and Hung-Wei Tseng and Chunbin Lin and Yannis Papakonstantinou and Steven Swanson", title = "{HippogriffDB}: balancing {I/O} and {GPU} bandwidth in big data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1647--1658", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data sets grow and conventional processor performance scaling slows, data analytics move towards heterogeneous architectures that incorporate hardware accelerators (notably GPUs) to continue scaling performance. However, existing GPU-based databases fail to deal with big data applications efficiently: their execution model suffers from scalability limitations on GPUs whose memory capacity is limited; existing systems fail to consider the discrepancy between fast GPUs and slow storage, which can counteract the benefit of GPU accelerators. In this paper, we propose HippogriffDB, an efficient, scalable GPU-accelerated OLAP system. It tackles the bandwidth discrepancy using compression and an optimized data transfer path. HippogriffDB stores tables in a compressed format and uses the GPU for decompression, trading GPU cycles for the improved I/O bandwidth. To improve the data transfer efficiency, HippogriffDB introduces a peer-to-peer, multi-threaded data transfer mechanism, directly transferring data from the SSD to the GPU. HippogriffDB adopts a query-over-block execution model that provides scalability using a stream-based approach. The model improves kernel efficiency with the operator fusion and double buffering mechanism. We have implemented HippogriffDB using an NVMe SSD, which talks directly to a commercial GPU. 
Results on two popular benchmarks demonstrate its scalability and efficiency. HippogriffDB outperforms existing GPU-based databases (YDB) and in-memory data analytics (MonetDB) by 1-2 orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeuch:2016:NIP, author = "Steffen Zeuch and Holger Pirk and Johann-Christoph Freytag", title = "Non-invasive progressive optimization for in-memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1659--1670", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Progressive optimization introduces robustness for database workloads against wrong estimates, skewed data, correlated attributes, or outdated statistics. Previous work focuses on cardinality estimates and rely on expensive counting methods as well as complex learning algorithms. In this paper, we utilize performance counters to drive progressive optimization during query execution. The main advantages are that performance counters introduce virtually no costs on modern CPUs and their usage enables a non-invasive monitoring. We present fine-grained cost models to detect differences between estimates and actual costs which enables us to kick-start reoptimization. Based on our cost models, we implement an optimization approach that estimates the individual selectivities of a multi-selection query efficiently. Furthermore, we are able to learn properties like sortedness, skew, or correlation during run-time. In our evaluation we show, that the overhead of our approach is negligible, while performance improvements are convincing. 
Using progressive optimization, we improve runtime up to a factor of three compared to average run-times and up to a factor of 4.5 compared to worst case run-times. As a result, we avoid costly operator execution orders and, thus, make query execution highly robust.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2016:DSS, author = "J. W. Zhang and Y. C. Tay", title = "{Dscaler}: synthetically scaling a given relational database", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1671--1682", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Dataset Scaling Problem (DSP) defined in previous work states: Given an empirical set of relational tables $D$ and a scale factor $s$, generate a database state $ \tilde {D} $ that is similar to $D$ but $s$ times its size. A DSP solution is useful for application development $ (s < 1) $, scalability testing $ (s > 1) $ and anonymization $ (s = 1) $. Current solutions assume all table sizes scale by the same ratio $s$. However, a real database tends to have tables that grow at different rates. This paper therefore considers non-uniform scaling (nuDSP), a DSP generalization where, instead of a single scale factor $s$, tables can scale by different factors. Dscaler is the first solution for nuDSP. It follows previous work in achieving similarity by reproducing correlation among the primary and foreign keys. However, it introduces the concept of a correlation database that captures fine-grained, per-tuple correlation. Experiments with well-known real and synthetic datasets $D$ show that Dscaler produces $ \tilde {D} $ with greater similarity to $D$ than state-of-the-art techniques.
Here, similarity is measured by number of tuples, frequency distribution of foreign key references, and multi-join aggregate queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:FAI, author = "Sheng Wang and David Maier and Beng Chin Ooi", title = "Fast and adaptive indexing of multi-dimensional observational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1683--1694", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sensing devices generate tremendous amounts of data each day, which include large quantities of multi-dimensional measurements. These data are expected to be immediately available for real-time analytics as they are streamed into storage. Such scenarios pose challenges to state-of-the-art indexing methods, as they must not only support efficient queries but also frequent updates. We propose here a novel indexing method that ingests multi-dimensional observational data in real time. This method primarily guarantees extremely high throughput for data ingestion, while it can be continuously refined in the background to improve query efficiency. Instead of representing collections of points using Minimal Bounding Boxes as in conventional indexes, we model sets of successive points as line segments in hyperspaces, by exploiting the intrinsic value continuity in observational data. This representation reduces the number of index entries and drastically reduces ``over-coverage'' by entries. Experimental results show that our approach handles real-world workloads gracefully, providing both low-overhead indexing and excellent query efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Upadhyaya:2016:POQ, author = "Prasang Upadhyaya and Magdalena Balazinska and Dan Suciu", title = "Price-optimal querying with data {APIs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1695--1706", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data is increasingly being purchased online in data markets and REST APIs have emerged as a favored method to acquire such data. Typically, sellers charge buyers based on how much data they purchase. In many scenarios, buyers need to make repeated calls to the seller's API. The challenge is then for buyers to keep track of the data they purchase and avoid purchasing the same data twice. In this paper, we propose lightweight modifications to data APIs to achieve optimal history-aware pricing so that buyers are only charged once for data that they have purchased and that has not been updated. The key idea behind our approach is the notion of refunds: buyers buy data as needed but have the ability to ask for refunds of data that they had already purchased before. We show that our techniques can provide significant data cost savings while reducing overheads by two orders of magnitude as compared to the state-of-the-art competing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pirk:2016:VVA, author = "Holger Pirk and Oscar Moll and Matei Zaharia and Sam Madden", title = "{Voodoo} --- a vector algebra for portable database performance on modern hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1707--1718", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory databases require careful tuning and many engineering tricks to achieve good performance. Such database performance engineering is hard: a plethora of data and hardware-dependent optimization techniques form a design space that is difficult to navigate for a skilled engineer --- even more so for a query compiler. To facilitate performance-oriented design exploration and query plan compilation, we present Voodoo, a declarative intermediate algebra that abstracts the detailed architectural properties of the hardware, such as multi- or many-core architectures, caches and SIMD registers, without losing the ability to generate highly tuned code. Because it consists of a collection of declarative, vector-oriented operations, Voodoo is easier to reason about and tune than low-level C and related hardware-focused extensions (Intrinsics, OpenCL, CUDA, etc.). This enables our Voodoo compiler to produce (OpenCL) code that rivals and even outperforms the fastest state-of-the-art in memory databases for both GPUs and CPUs. In addition, Voodoo makes it possible to express techniques as diverse as cache-conscious processing, predication and vectorization (again on both GPUs and CPUs) with just a few lines of code. 
Central to our approach is a novel idea we termed control vectors, which allows a code generating frontend to expose parallelism to the Voodoo compiler in an abstract manner, enabling portable performance across hardware platforms. We used Voodoo to build an alternative backend for MonetDB, a popular open-source in-memory database. Our backend allows MonetDB to perform at the same level as highly tuned in-memory databases, including HyPeR and Ocelot. We also demonstrate Voodoo's usefulness when investigating hardware conscious tuning techniques, assessing their performance on different queries, devices and data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2016:CQP, author = "Dawei Jiang and Qingchao Cai and Gang Chen and H. V. Jagadish and Beng Chin Ooi and Kian-Lee Tan and Anthony K. H. Tung", title = "Cohort query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "1", pages = "1--12", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3015270.3015271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:50 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern Internet applications often produce a large volume of user activity records. Data analysts are interested in cohort analysis, or finding unusual user behavioral trends, in these large tables of activity records. In a traditional database system, cohort analysis queries are both painful to specify and expensive to evaluate. We propose to extend database systems to support cohort analysis. We do so by extending SQL with three new operators. We devise three different evaluation schemes for cohort query processing. Two of them adopt a non-intrusive approach.
The third approach employs a columnar based evaluation scheme with optimizations specifically designed for cohort query processing. Our experimental results confirm the performance benefits of our proposed columnar database system, compared against the two non-intrusive approaches that implement cohort queries on top of regular relational databases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2016:RWY, author = "Yubao Wu and Yuchen Bian and Xiang Zhang", title = "Remember where you came from: on the second-order random walk based proximity measures", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "1", pages = "13--24", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3015270.3015272", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:50 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Measuring the proximity between different nodes is a fundamental problem in graph analysis. Random walk based proximity measures have been shown to be effective and widely used. Most existing random walk measures are based on the first-order Markov model, i.e., they assume that the next step of the random surfer only depends on the current node. However, this assumption neither holds in many real-life applications nor captures the clustering structure in the graph. To address the limitation of the existing first-order measures, in this paper, we study the second-order random walk measures, which take the previously visited node into consideration. While the existing first-order measures are built on node-to-node transition probabilities, in the second-order random walk, we need to consider the edge-to-edge transition probabilities. 
Using incidence matrices, we develop simple and elegant matrix representations for the second-order proximity measures. A desirable property of the developed measures is that they degenerate to their original first-order forms when the effect of the previous step is zero. We further develop Monte Carlo methods to efficiently compute the second-order measures and provide theoretical performance guarantees. Experimental results show that in a variety of applications, the second-order measures can dramatically improve the performance compared to their first-order counterparts.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{George:2016:MIL, author = "Lars George and Bruno Cadonna and Matthias Weidlich", title = "{IL-Miner}: instance-level discovery of complex event patterns", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "1", pages = "25--36", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3015270.3015273", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:50 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Complex event processing (CEP) matches patterns over a continuous stream of events to detect situations of interest. Yet, the definition of an event pattern that precisely characterises a particular situation is challenging: there are manifold dimensions to correlate events, including time windows and value predicates. In the presence of historic event data that is labelled with the situation to detect, event patterns can be learned automatically. To cope with the combinatorial explosion of pattern candidates, existing approaches work on a type-level and discover patterns based on predefined event abstractions, aka event types. 
Hence, discovery is limited to patterns of a fixed granularity and users face the burden to manually select appropriate event abstractions. We present IL-Miner, a system that discovers event patterns by genuinely working on the instance-level, not assuming a priori knowledge on event abstractions. In a multi-phase process, IL-Miner first identifies relevant abstractions for the construction of event patterns. The set of events explored for pattern discovery is thereby reduced, while still providing formal guarantees on correctness, minimality, and completeness of the discovery result. Experiments using real-world datasets from diverse domains show that IL-Miner discovers a much broader range of event patterns compared to the state-of-the-art in the field.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Psaroudakis:2016:ANA, author = "Iraklis Psaroudakis and Tobias Scheuer and Norman May and Abdelkader Sellami and Anastasia Ailamaki", title = "Adaptive {NUMA}-aware data placement and task scheduling for analytical workloads in main-memory column-stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "2", pages = "37--48", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Non-uniform memory access (NUMA) architectures pose numerous performance challenges for main-memory column-stores in scaling up analytics on modern multi-socket multi-core servers. A NUMA-aware execution engine needs a strategy for data placement and task scheduling that prefers fast local memory accesses over remote memory accesses, and avoids an imbalance of resource utilization, both CPU and memory bandwidth, across sockets.
State-of-the-art systems typically use a static strategy that always partitions data across sockets, and always allows inter-socket task stealing. In this paper, we show that adapting data placement and task stealing to the workload can improve throughput by up to a factor of 4 compared to a static approach. We focus on highly concurrent workloads dominated by operators working on a single table or table group (copartitioned tables). Our adaptive data placement algorithm tracks the resource utilization of tasks, partitions of tables and table groups, and sockets. When a utilization imbalance across sockets is detected, the algorithm corrects it by moving or repartitioning tables. Also, inter-socket task stealing is dynamically disabled for memory-intensive tasks that could otherwise hurt performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:MOC, author = "Tianzheng Wang and Hideaki Kimura", title = "Mostly-optimistic concurrency control for highly contended dynamic workloads on a thousand cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "2", pages = "49--60", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Future servers will be equipped with thousands of CPU cores and deep memory hierarchies. Traditional concurrency control (CC) schemes---both optimistic and pessimistic---slow down orders of magnitude in such environments for highly contended workloads. Optimistic CC (OCC) scales the best for workloads with few conflicts, but suffers from clobbered reads for high conflict workloads. 
Although pessimistic locking can protect reads, it floods cache-coherence backbones in deep memory hierarchies and can also cause numerous deadlock aborts. This paper proposes a new CC scheme, mostly-optimistic concurrency control (MOCC), to address these problems. MOCC achieves orders of magnitude higher performance for dynamic workloads on modern servers. The key objective of MOCC is to avoid clobbered reads for high conflict workloads, without any centralized mechanisms or heavyweight interthread communication. To satisfy such needs, we devise a native, cancellable reader-writer spinlock and a serializable protocol that can acquire, release and re-acquire locks in any order without expensive interthread communication. For low conflict workloads, MOCC maintains OCC's high performance without taking read locks. Our experiments with high conflict YCSB workloads on a 288-core server reveal that MOCC performs $ 8 \times $ and $ 23 \times $ faster than OCC and pessimistic locking, respectively. It achieves 17 million TPS for TPC-C and more than 110 million TPS for YCSB without conflicts, $ 170 \times $ faster than pessimistic methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:EIA, author = "Sibo Wang and Xiaokui Xiao and Yin Yang and Wenqing Lin", title = "Effective indexing for approximate constrained shortest path queries on large road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "2", pages = "61--72", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In a constrained shortest path (CSP) query, each edge in the road network is associated with both a length and a cost. 
Given an origin $s$, a destination $t$, and a cost constraint $ \theta $, the goal is to find the shortest path from $s$ to $t$ whose total cost does not exceed $ \theta $. Because exact CSP is NP-hard, previous work mostly focuses on approximate solutions. Even so, existing methods are still prohibitively expensive for large road networks. Two main reasons are (i) that they fail to utilize the special properties of road networks and (ii) that most of them process queries without indices; the few existing indices consume large amounts of memory and yet have limited effectiveness in reducing query costs. Motivated by this, we propose COLA, the first practical solution for approximate CSP processing on large road networks. COLA exploits the facts that a road network can be effectively partitioned, and that there exists a relatively small set of landmark vertices that commonly appear in CSP results. Accordingly, COLA indexes the vertices lying on partition boundaries, and applies an on-the-fly algorithm called $ \alpha $-Dijk for path computation within a partition, which effectively prunes paths based on landmarks. Extensive experiments demonstrate that on continent-sized road networks, COLA answers an approximate CSP query in sub-second time, whereas existing methods take hours. Interestingly, even without an index, the $ \alpha $-Dijk algorithm in COLA still outperforms previous solutions by more than an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2016:THP, author = "Qun Huang and Patrick P. C. 
Lee", title = "Toward high-performance distributed stream processing via approximate fault tolerance", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "73--84", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fault tolerance is critical for distributed stream processing systems, yet achieving error-free fault tolerance often incurs substantial performance overhead. We present AF-Stream, a distributed stream processing system that addresses the trade-off between performance and accuracy in fault tolerance. AF-Stream builds on a notion called approximate fault tolerance, whose idea is to mitigate backup overhead by adaptively issuing backups, while ensuring that the errors upon failures are bounded with theoretical guarantees. Our AF-Stream design provides an extensible programming model for incorporating general streaming algorithms, and also exports only few threshold parameters for configuring approximation fault tolerance. Experiments on Amazon EC2 show that AF-Stream maintains high performance (compared to no fault tolerance) and high accuracy after multiple failures (compared to no failures) under various streaming algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dai:2016:PCD, author = "Jian Dai and Bin Yang and Chenjuan Guo and Christian S. 
Jensen and Jilin Hu", title = "Path cost distribution estimation using trajectory data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "85--96", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the growing volumes of vehicle trajectory data, it becomes increasingly possible to capture time-varying and uncertain travel costs in a road network, including travel time and fuel consumption. The current paradigm represents a road network as a weighted graph; it blasts trajectories into small fragments that fit the underlying edges to assign weights to edges; and it then applies a routing algorithm to the resulting graph. We propose a new paradigm, the hybrid graph, that targets more accurate and more efficient path cost distribution estimation. The new paradigm avoids blasting trajectories into small fragments and instead assigns weights to paths rather than simply to the edges. We show how to compute path weights using trajectory data while taking into account the travel cost dependencies among the edges in the paths. Given a departure time and a query path, we show how to select an optimal set of weights with associated paths that cover the query path and such that the weights enable the most accurate joint cost distribution estimation for the query path. The cost distribution of the query path is then computed accurately using the joint distribution. Finally, we show how the resulting method for computing cost distributions of paths can be integrated into existing routing algorithms. Empirical studies with substantial trajectory data from two different cities offer insight into the design properties of the proposed method and confirm that the method is effective in real-world settings.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sariyuce:2016:FHC, author = "Ahmet Erdem Sariy{\"u}ce and Ali Pinar", title = "Fast hierarchy construction for dense subgraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "97--108", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Discovering dense subgraphs and understanding the relations among them is a fundamental problem in graph mining. We want to not only identify dense subgraphs, but also build a hierarchy among them (e.g., larger but sparser subgraphs formed by two smaller dense subgraphs). Peeling algorithms ($k$-core, $k$-truss, and nucleus decomposition) have been effective to locate many dense subgraphs. However, constructing a hierarchical representation of density structure, even correctly computing the connected $k$-cores and $k$-trusses, have been mostly overlooked. Keeping track of connected components during peeling requires an additional traversal operation, which is as expensive as the peeling process. In this paper, we start with a thorough survey and point to nuances in problem formulations that lead to significant differences in runtimes. We then propose efficient and generic algorithms to construct the hierarchy of dense subgraphs for $k$-core, $k$-truss, or any nucleus decomposition. Our algorithms leverage the disjoint-set forest data structure to efficiently construct the hierarchy during traversal. Furthermore, we introduce a new idea to avoid traversal. We construct the subgraphs while visiting neighborhoods in the peeling process, and build the relations to previously constructed subgraphs. We also consider an existing idea to find the $k$-core hierarchy and adapt for our objectives efficiently.
Experiments on different types of large scale real-world networks show significant speedups over naive algorithms and existing alternatives. Our algorithms also outperform the hypothetical limits of any possible traversal-based solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2016:SEE, author = "Xuhong Zhang and Jun Wang and Jiangling Yin", title = "{Sapprox}: enabling efficient and accurate approximations on sub-datasets with distribution-aware online sampling", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "109--120", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we aim to enable both efficient and accurate approximations on arbitrary sub-datasets of a large dataset. Due to the prohibitive storage overhead of caching offline samples for each sub-dataset, existing offline sample based systems provide high accuracy results for only a limited number of sub-datasets, such as the popular ones. On the other hand, current online sample based approximation systems, which generate samples at runtime, do not take into account the uneven storage distribution of a sub-dataset. They work well for uniform distribution of a sub-dataset while suffer low sampling efficiency and poor estimation accuracy on unevenly distributed sub-datasets. To address the problem, we develop a distribution aware method called Sapprox. Our idea is to collect the occurrences of a sub-dataset at each logical partition of a dataset (storage distribution) in the distributed system, and make good use of such information to facilitate online sampling. There are three thrusts in Sapprox. 
First, we develop a probabilistic map to reduce the exponential number of recorded sub-datasets to a linear one. Second, we apply the cluster sampling with unequal probability theory to implement a distribution-aware sampling method for efficient online sub-dataset sampling. Third, we quantitatively derive the optimal sampling unit size in a distributed file system by associating it with approximation costs and accuracy. We have implemented Sapprox into Hadoop ecosystem as an example system and open sourced it on GitHub. Our comprehensive experimental results show that Sapprox can achieve a speedup by up to $ 20 \times $ over the precise execution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2016:MQO, author = "Xuguang Ren and Junhu Wang", title = "Multi-query optimization for subgraph isomorphism search", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "121--132", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing work on subgraph isomorphism search mainly focuses on a-query-at-a-time approaches: optimizing and answering each query separately. When multiple queries arrive at the same time, sequential processing is not always the most efficient. In this paper, we study multi-query optimization for subgraph isomorphism search. We first propose a novel method for efficiently detecting useful common sub-graphs and a data structure to organize them. Then we propose a heuristic algorithm based on the data structure to compute a query execution order so that cached intermediate results can be effectively utilized. 
To balance memory usage and the time for cached results retrieval, we present a novel structure for caching the intermediate results. We provide strategies to revise existing single-query subgraph isomorphism algorithms to seamlessly utilize the cached results, which leads to significant performance improvement. Extensive experiments verified the effectiveness of our solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Simpson:2016:ECF, author = "Michael Simpson and Venkatesh Srinivasan and Alex Thomo", title = "Efficient computation of feedback arc set at web-scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "133--144", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The minimum feedback arc set problem is an NP-hard problem on graphs that seeks a minimum set of arcs which, when removed from the graph, leave it acyclic. In this work, we investigate several approximations for computing a minimum feedback arc set with the goal of comparing the quality of the solutions and the running times. Our investigation is motivated by applications in Social Network Analysis such as misinformation removal and label propagation. We present careful algorithmic engineering for multiple algorithms to improve the scalability of each approach. In particular, two approaches we optimize (one greedy and one randomized) provide a nice balance between feedback arc set size and running time complexity. We experimentally compare the performance of a wide range of algorithms on a broad selection of large online networks including Twitter, LiveJournal, and the Clueweb12 dataset. 
The experiments reveal that our greedy and randomized implementations outperform the other approaches by simultaneously computing a feedback arc set of competitive size and scaling to web-scale graphs with billions of vertices and tens of billions of arcs. Finally, we extend the algorithms considered to the probabilistic case in which arcs are realized with some fixed probability and provide detailed experimental comparisons.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antenucci:2016:DQP, author = "Dolan Antenucci and Michael R. Anderson and Michael Cafarella", title = "A declarative query processing system for nowcasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "145--156", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowcasting is the practice of using social media data to quantify ongoing real-world phenomena. It has been used by researchers to measure flu activity, unemployment behavior, and more. However, the typical nowcasting workflow requires either slow and tedious manual searching of relevant social media messages or automated statistical approaches that are prone to spurious and low-quality results. In this paper, we propose a method for declaratively specifying a nowcasting model; this method involves processing a user query over a very large social media database, which can take hours. Due to the human-in-the-loop nature of constructing nowcasting models, slow runtimes place an extreme burden on the user. Thus we also propose a novel set of query optimization techniques, which allow users to quickly construct nowcasting models over very large datasets. 
Further, we propose a novel query quality alarm that helps users estimate phenomena even when historical ground truth data is not available. These contributions allow us to build a declarative nowcasting data management system, RaccoonDB, which yields high-quality results in interactive time. We evaluate RaccoonDB using 40 billion tweets collected over five years. We show that our automated system saves work over traditional manual approaches while improving result quality---57\% more accurate in our user study---and that its query optimizations yield a 424x speedup, allowing it to process queries 123x faster than a 300-core Spark cluster, using only 10\% of the computational resources.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lulli:2016:NDS, author = "Alessandro Lulli and Matteo Dell'Amico and Pietro Michiardi and Laura Ricci", title = "{NG-DBSCAN}: scalable density-based clustering for arbitrary data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "157--168", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present NG-DBSCAN, an approximate density-based clustering algorithm that operates on arbitrary data and any symmetric distance measure. The distributed design of our algorithm makes it scalable to very large datasets; its approximate nature makes it fast, yet capable of producing high quality clustering results. We provide a detailed overview of the steps of NG-DBSCAN, together with their analysis. Our results, obtained through an extensive experimental campaign with real and synthetic data, substantiate our claims about NG-DBSCAN's performance and scalability.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Neamtu:2016:ITS, author = "Rodica Neamtu and Ramoza Ahsan and Elke Rundensteiner and Gabor Sarkozy", title = "Interactive time series exploration powered by the marriage of similarity distances", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "169--180", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding similar trends among time series data is critical for applications ranging from financial planning to policy making. The detection of these multifaceted relationships, especially time warped matching of time series of different lengths and alignments is prohibitively expensive to compute. To achieve real time responsiveness on large time series datasets, we propose a novel paradigm called Online Exploration of Time Series (ONEX) employing a powerful one-time preprocessing step that encodes critical similarity relationships to support subsequent rapid data exploration. Since the encoding of a huge number of pairwise similarity relationships for all variable lengths time series segments is not feasible, our work rests on the important insight that clustering with inexpensive point-to-point distances such as the Euclidean Distance can support subsequent time warped matching. Our ONEX framework overcomes the prohibitive computational costs associated with a more robust elastic distance namely the DTW by applying it over the surprisingly compact knowledge base instead of the raw data. Our comparative study reveals that ONEX is up to 19\% more accurate and several times faster than the state-of-the-art. 
Beyond being a highly accurate and fast domain independent solution, ONEX offers a truly interactive exploration experience supporting novel time series operations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:CLI, author = "Youhuan Li and Lei Zou and Huaming Zhang and Dongyan Zhao", title = "Computing longest increasing subsequences over sequential data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "181--192", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose a data structure, a quadruple neighbor list (QN-list, for short), to support real time queries of all longest increasing subsequence (LIS) and LIS with constraints over sequential data streams. The QN-List built by our algorithm requires $ O(w) $ space, where w is the time window size. The running time for building the initial QN-List takes $ O(w \log w) $ time. Applying the QN-List, insertion of the new item takes $ O(\log w) $ time and deletion of the first item takes $ O(w) $ time. To the best of our knowledge, this is the first work to support both LIS enumeration and LIS with constraints computation by using a single uniform data structure for real time sequential data streams. Our method outperforms the state-of-the-art methods in both time and space cost, not only theoretically, but also empirically.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chirigati:2016:KEU, author = "Fernando Chirigati and Jialu Liu and Flip Korn and You (Will) Wu and Cong Yu and Hao Zhang", title = "Knowledge exploration using tables on the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "193--204", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The increasing popularity of mobile device usage has ushered in many features in modern search engines that help users with various information needs. One of those needs is Knowledge Exploration, where related documents are returned in response to a user query, either directly through right-hand side knowledge panels or indirectly through navigable sections underneath individual search results. Existing knowledge exploration features have relied on a combination of Knowledge Bases and query logs. In this paper, we propose Knowledge Carousels of two modalities, namely sideways and downwards, that facilitate exploration of IS-A and HAS-A relationships, respectively, with regard to an entity-seeking query, based on leveraging the large corpus of tables on the Web. This brings many technical challenges, including associating correct carousels with the search entity, selecting the best carousel from the candidates, and finding titles that best describe the carousel. We describe how we address these challenges and also experimentally demonstrate through user studies that our approach produces better result sets than baseline approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:HEI, author = "Sibo Wang and Youze Tang and Xiaokui Xiao and Yin Yang and Zengxiang Li", title = "{HubPPR}: effective indexing for approximate {Personalized PageRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "205--216", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Personalized PageRank (PPR) computation is a fundamental operation in web search, social networks, and graph analysis. Given a graph $G$, a source $s$, and a target $t$, the PPR query $ \Pi (s, t)$ returns the probability that a random walk on $G$ starting from $s$ terminates at $t$. Unlike global PageRank which can be effectively pre-computed and materialized, the PPR result depends on both the source and the target, rendering results materialization infeasible for large graphs. Existing indexing techniques have rather limited effectiveness; in fact, the current state-of-the-art solution, BiPPR, answers individual PPR queries without pre-computation or indexing, and yet it outperforms all previous index-based solutions. Motivated by this, we propose HubPPR, an effective indexing scheme for PPR computation with controllable tradeoffs for accuracy, query time, and memory consumption. The main idea is to pre-compute and index auxiliary information for selected hub nodes that are often involved in PPR processing. Going one step further, we extend HubPPR to answer top-$k$ PPR queries, which returns the $k$ nodes with the highest PPR values with respect to a source $s$, among a given set $T$ of target nodes.
Extensive experiments demonstrate that compared to the current best solution BiPPR, HubPPR achieves up to 10x and 220x speedup for PPR and top-$k$ PPR processing, respectively, with moderate memory consumption. Notably, with a single commodity server, HubPPR answers a top-$k$ PPR query in seconds on graphs with billions of edges, with high accuracy and strong result quality guarantees.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lai:2016:SDS, author = "Longbin Lai and Lu Qin and Xuemin Lin and Ying Zhang and Lijun Chang and Shiyu Yang", title = "Scalable distributed subgraph enumeration", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "217--228", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph enumeration aims to find all the subgraphs of a large data graph that are isomorphic to a given pattern graph. As the subgraph isomorphism operation is computationally intensive, researchers have recently focused on solving this problem in distributed environments, such as MapReduce and Pregel. Among them, the state-of-the-art algorithm, TwinTwigJoin, is proven to be instance optimal based on a left-deep join framework. However, it is still not scalable to large graphs because of the constraints in the left-deep join framework and that each decomposed component (join unit) must be a star. In this paper, we propose SEED --- a scalable sub-graph enumeration approach in the distributed environment. Compared to TwinTwigJoin, SEED returns optimal solution in a generalized join framework without the constraints in TwinTwigJoin.
We use both star and clique as the join units, and design an effective distributed graph storage mechanism to support such an extension. We develop a comprehensive cost model, that estimates the number of matches of any given pattern graph by considering power-law degree distribution in the data graph. We then generalize the left-deep join framework and develop a dynamic-programming algorithm to compute an optimal bushy join plan. We also consider overlaps among the join units. Finally, we propose clique compression to further improve the algorithm by reducing the number of the intermediate results. Extensive performance studies are conducted on several real graphs, one containing billions of edges. The results demonstrate that our algorithm outperforms all other state-of-the-art algorithms by more than one order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fujiwara:2016:FAL, author = "Yasuhiro Fujiwara and Yasutoshi Ida and Junya Arai and Mai Nishimura and Sotetsu Iwamura", title = "Fast algorithm for the lasso based {$ L_1 $}-graph construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "229--240", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The lasso-based $ L_1$-graph is used in many applications since it can effectively model a set of data points as a graph. The lasso is a popular regression approach and the $ L_1$-graph represents data points as nodes by using the regression result. More specifically, by solving the $ L_1$-optimization problem of the lasso, the sparse regression coefficients are used to obtain the weights of the edges in the graph.
Conventional graph structures such as $k$-NN graph use two steps, adjacency searching and weight selection, for constructing the graph whereas the lasso-based $ L_1$-graph derives the adjacency structure as well as the edge weights simultaneously by using a coordinate descent. However, the construction cost of the lasso-based $ L_1$-graph is impractical for large data sets since the coordinate descent iteratively updates the weights of all edges until convergence. Our proposal, Castnet, can efficiently construct the lasso-based $ L_1$-graph. In order to avoid updating the weights of all edges, we prune edges that cannot have nonzero weights before entering the iterations. In addition, we update edge weights only if they are nonzero in the iterations. Experiments show that Castnet is significantly faster than existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhai:2016:RTS, author = "Ennan Zhai and Zhenhua Li and Zhenyu Li and Fan Wu and Guihai Chen", title = "Resisting tag spam by leveraging implicit user behaviors", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "241--252", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tagging systems are vulnerable to tag spam attacks. However, defending against tag spam has been challenging in practice, since adversaries can easily launch spam attacks in various ways and scales. To deeply understand users' tagging behaviors and explore more effective defense, this paper first conducts measurement experiments on public datasets of two representative tagging systems: Del.icio.us and CiteULike.
Our key finding is that a significant fraction of correct tag-resource annotations are contributed by a small number of implicit similarity cliques, where users annotate common resources with similar tags. Guided by the above finding, we propose a new service, called Spam-Resistance-as-a-Service (or SRaaS), to effectively defend against heterogeneous tag spam attacks even at very large scales. At the heart of SRaaS is a novel reputation assessment protocol, whose design leverages the implicit similarity cliques coupled with the social networks inherent to typical tagging systems. With such a design, SRaaS manages to offer provable guarantees on diminishing the influence of tag spam attacks. We build an SRaaS prototype and evaluate it using a large-scale spam-oriented research dataset (which is much more polluted by tag spam than Del.icio.us and CiteULike datasets). Our evaluational results demonstrate that SRaaS outperforms existing tag spam defenses deployed in real-world systems, while introducing low overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2016:GFE, author = "Xiaowei Chen and Yongkun Li and Pinghui Wang and John C. S. Lui", title = "A general framework for estimating graphlet statistics via random walk", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "253--264", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphlets are induced subgraph patterns and have been frequently applied to characterize the local topology structures of graphs across various domains, e.g., online social networks (OSNs) and biological networks. Discovering and computing graphlet statistics are highly challenging. 
First, the massive size of real-world graphs makes the exact computation of graphlets extremely expensive. Secondly, the graph topology may not be readily available so one has to resort to web crawling using the available application programming interfaces (APIs). In this work, we propose a general and novel framework to estimate graphlet statistics of ``any size.'' Our framework is based on collecting samples through consecutive steps of random walks. We derive an analytical bound on the sample size (via the Chernoff--Hoeffding technique) to guarantee the convergence of our unbiased estimator. To further improve the accuracy, we introduce two novel optimization techniques to reduce the lower bound on the sample size. Experimental evaluations demonstrate that our methods outperform the state-of-the-art method up to an order of magnitude both in terms of accuracy and time cost.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2016:FMS, author = "Chunbin Lin and Benjamin Mandel and Yannis Papakonstantinou and Matthias Springer", title = "Fast in-memory {SQL} analytics on typed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "265--276", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study a class of graph analytics SQL queries, which we call relationship queries. These queries involving aggregation, join, semijoin, intersection and selection are a wide superset of fixed-length graph reachability queries and of tree pattern queries. We present real-world OLAP scenarios, where efficient relationship queries are needed. However, row stores, column stores and graph databases are unacceptably slow in such OLAP scenarios. 
We propose a GQ-Fast database, which is an indexed database that roughly corresponds to efficient encoding of annotated adjacency lists that combines salient features of column-based organization, indexing and compression. GQ-Fast uses a bottom-up fully pipelined query execution model, which enables (a) aggressive compression (e.g., compressed bitmaps and Huffman) and (b) avoids intermediate results that consist of row IDs (which are typical in column databases). GQ-Fast compiles query plans into executable C++ source code. Besides achieving runtime efficiency, GQ-Fast also reduces main memory requirements because, unlike column databases, GQ-Fast selectively allows dense forms of compression including heavy-weight compressions, which do not support random access. We used GQ-Fast to accelerate queries for two OLAP dashboards in the biomedical field. GQ-Fast outperforms PostgreSQL by 2--4 orders of magnitude and MonetDB, Vertica and Neo4j by 1--3 orders of magnitude when all of them are running on RAM. Our experiments dissect GQ-Fast's advantage between (i) the use of compiled code, (ii) the bottom-up pipelining execution strategy, and (iii) the use of dense structures. Other analysis and experiments show the space savings of GQ-Fast due to the appropriate use of compression methods. We also show that the runtime penalty incurred by the dense compression methods decreases as the number of CPU cores increases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:SDA, author = "Zheng Li and Tingjian Ge", title = "Stochastic data acquisition for answering queries as time goes by", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "3", pages = "277--288", month = nov, year = "2016", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Dec 1 09:02:03 MST 2016", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data and actions are tightly coupled. On one hand, data analysis results trigger decision making and actions. On the other hand, the action of acquiring data is the very first step in the whole data processing pipeline. Data acquisition almost always has some costs, which could be either monetary costs or computing resource costs such as sensor battery power, network transfers, or I/O costs. Using out-dated data to answer queries can avoid the data acquisition costs, but there is a penalty of potentially inaccurate results. Given a sequence of incoming queries over time, we study the problem of sequential decision making on when to acquire data and when to use existing versions to answer each query. We propose two approaches to solve this problem using reinforcement learning and tailored locality-sensitive hashing. A systematic empirical study using two real-world datasets shows that our approaches are effective and efficient.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dai:2016:FPI, author = "Haipeng Dai and Muhammad Shahzad and Alex X. 
Liu and Yuankun Zhong", title = "Finding persistent items in data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "289--300", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025112", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Frequent item mining, which deals with finding items that occur frequently in a given data stream over a period of time, is one of the heavily studied problems in data stream mining. A generalized version of frequent item mining is the persistent item mining, where a persistent item, unlike a frequent item, does not necessarily occur more frequently compared to other items over a short period of time, rather persists and occurs more frequently over a long period of time. To the best of our knowledge, there is no prior work on mining persistent items in a data stream. In this paper, we address the fundamental problem of finding persistent items in a given data stream during a given period of time at any given observation point. We propose a novel scheme, PIE, that can accurately identify each persistent item with a probability greater than any desired false negative rate (FNR) while using a very small amount of memory. The key idea of PIE is that it uses Raptor codes to encode the ID of each item that appears at the observation point during a measurement period and stores only a few bits of the encoded ID in the memory of that observation point during that measurement period. The item that is persistent occurs in enough measurement periods that enough encoded bits for the ID can be retrieved from the observation point to decode them correctly and get the ID of the persistent item. We implemented and extensively evaluated PIE using three real network traffic traces and compared its performance with two prior adapted schemes. 
Our results show that not only PIE achieves the desired FNR in every scenario, its FNR, on average, is 19.5 times smaller than the FNR of the best adapted prior art.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2016:BSD, author = "Shuotao Xu and Sungjin Lee and Sang-Woo Jun and Ming Liu and Jamey Hicks and Arvind", title = "{BlueCache}: a scalable distributed flash-based key--value store", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "301--312", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025113", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A key--value store (KVS), such as memcached and Redis, is widely used as a caching layer to augment the slower persistent backend storage in data centers. DRAM-based KVS provides fast key--value access, but its scalability is limited by the cost, power and space needed by the machine cluster to support a large amount of DRAM. This paper offers a 10X to 100X cheaper solution based on flash storage and hardware accelerators. In BlueCache key--value pairs are stored in flash storage and all KVS operations, including the flash controller are directly implemented in hardware. Furthermore, BlueCache includes a fast interconnect between flash controllers to provide a scalable solution. We show that BlueCache has 4.18X higher throughput and consumes 25X less power than a flash-backed KVS software implementation on x86 servers. We further show that BlueCache can outperform DRAM-based KVS when the latter has more than 7.4\% misses for a read-intensive application. 
BlueCache is an attractive solution for both rack-level appliances and data-center-scale key--value cache.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2016:GPP, author = "Qi Fan and Dongxiang Zhang and Huayu Wu and Kian-Lee Tan", title = "A general and parallel platform for mining co-movement patterns over large-scale trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "313--324", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025114", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Discovering co-movement patterns from large-scale trajectory databases is an important mining task and has a wide spectrum of applications. Previous studies have identified several types of interesting co-movement patterns and show-cased their usefulness. In this paper, we make two key contributions to this research field. First, we propose a more general co-movement pattern to unify those defined in the past literature. Second, we propose two types of parallel and scalable frameworks and deploy them on Apache Spark. To the best of our knowledge, this is the first work to mine co-movement patterns in real life trajectory databases with hundreds of millions of points. Experiments on three real life large-scale trajectory datasets have verified the efficiency and scalability of our proposed solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shao:2016:VTE, author = "Zhou Shao and Muhammad Aamir Cheema and David Taniar and Hua Lu", title = "{VIP-Tree}: an effective index for indoor spatial queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "325--336", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025115", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to the growing popularity of indoor location-based services, indoor data management has received significant research attention in the past few years. However, we observe that the existing indexing and query processing techniques for the indoor space do not fully exploit the properties of the indoor space. Consequently, they provide below par performance which makes them unsuitable for large indoor venues with high query workloads. In this paper, we propose two novel indexes called Indoor Partitioning Tree (IP-Tree) and Vivid IP-Tree (VIP-Tree) that are carefully designed by utilizing the properties of indoor venues. The proposed indexes are lightweight, have small pre-processing cost and provide near-optimal performance for shortest distance and shortest path queries. We also present efficient algorithms for other spatial queries such as $k$ nearest neighbors queries and range queries. Our extensive experimental study on real and synthetic data sets demonstrates that our proposed indexes outperform the existing algorithms by several orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arulraj:2016:WBL, author = "Joy Arulraj and Matthew Perron and Andrew Pavlo", title = "Write-behind logging", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "337--348", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025116", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The design of the logging and recovery components of database management systems (DBMSs) has always been influenced by the difference in the performance characteristics of volatile (DRAM) and non-volatile storage devices (HDD/SSDs). The key assumption has been that non-volatile storage is much slower than DRAM and only supports block-oriented read/writes. But the arrival of new non-volatile memory (NVM) storage that is almost as fast as DRAM with fine-grained read/writes invalidates these previous design choices. This paper explores the changes that are required in a DBMS to leverage the unique properties of NVM in systems that still include volatile DRAM. We make the case for a new logging and recovery protocol, called write-behind logging, that enables a DBMS to recover nearly instantaneously from system failures. The key idea is that the DBMS logs what parts of the database have changed rather than how it was changed. Using this method, the DBMS flushes the changes to the database {\em before} recording them in the log. Our evaluation shows that this protocol improves a DBMS's transactional throughput by 1.3$ \times $, reduces the recovery time by more than two orders of magnitude, and shrinks the storage footprint of the DBMS on NVM by 1.5$ \times $. 
We also demonstrate that our logging protocol is compatible with standard replication schemes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Papadopoulos:2016:TAD, author = "Stavros Papadopoulos and Kushal Datta and Samuel Madden and Timothy Mattson", title = "The {TileDB} array data storage manager", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "349--360", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025117", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a novel storage manager for multi-dimensional arrays that arise in scientific applications, which is part of a larger scientific data management system called TileDB. In contrast to existing solutions, TileDB is optimized for both dense and sparse arrays. Its key idea is to organize array elements into ordered collections called fragments. Each fragment is dense or sparse, and groups contiguous array elements into data tiles of fixed capacity. The organization into fragments turns random writes into sequential writes, and, coupled with a novel read algorithm, leads to very efficient reads. TileDB enables parallelization via multi-threading and multi-processing, offering thread-/process-safety and atomicity via lightweight locking. We show that TileDB delivers comparable performance to the HDF5 dense array storage manager, while providing much faster random writes. We also show that TileDB offers substantially faster reads and writes than the SciDB array database system with both dense and sparse arrays. 
Finally, we demonstrate that TileDB is considerably faster than adaptations of the Vertica relational column-store for dense array storage management, and at least as fast for the case of sparse arrays.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2016:DDA, author = "Yudian Zheng and Guoliang Li and Reynold Cheng", title = "{DOCS}: a domain-aware crowdsourcing system using knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "361--372", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025118", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourcing is a new computing paradigm that harnesses human effort to solve computer-hard problems, such as entity resolution and photo tagging. The crowd (or workers) have diverse qualities and it is important to effectively model a worker's quality. Most of existing worker models assume that workers have the same quality on different tasks. In practice, however, tasks belong to a variety of diverse domains, and workers have different qualities on different domains. For example, a worker who is a basketball fan should have better quality for the task of labeling a photo related to `Stephen Curry' than the one related to `Leonardo DiCaprio'. In this paper, we study how to leverage domain knowledge to accurately model a worker's quality. We examine using knowledge base (KB), e.g., Wikipedia and Freebase, to detect the domains of tasks and workers. We develop Domain Vector Estimation, which analyzes the domains of a task with respect to the KB. We also study Truth Inference, which utilizes the domain-sensitive worker model to accurately infer the true answer of a task. 
We design an Online Task Assignment algorithm, which judiciously and efficiently assigns tasks to appropriate workers. To implement these solutions, we have built DOCS, a system deployed on the Amazon Mechanical Turk. Experiments show that DOCS performs much better than the state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2016:LHC, author = "Yue Wang and Alexandra Meliou and Gerome Miklau", title = "Lifting the haze off the cloud: a consumer-centric market for database computation in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "373--384", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025119", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The availability of public computing resources in the cloud has revolutionized data analysis, but requesting cloud resources often involves complex decisions for consumers. Estimating the completion time and cost of a computation and requesting the appropriate cloud resources are challenging tasks even for an expert user. We propose a new market-based framework for pricing computational tasks in the cloud. Our framework introduces an agent between consumers and cloud providers. The agent takes data and computational tasks from users, estimates time and cost for evaluating the tasks, and returns to consumers contracts that specify the price and completion time. Our framework can be applied directly to existing cloud markets without altering the way cloud providers offer and price services. In addition, it simplifies cloud use for consumers by allowing them to compare contracts, rather than choose resources directly. 
We present design, analytical, and algorithmic contributions focusing on pricing computation contracts, analyzing their properties, and optimizing them in complex workflows. We conduct an experimental evaluation of our market framework over a real-world cloud service and demonstrate empirically that our market ensures three key properties: (a) that consumers benefit from using the market due to competitiveness among agents, (b) that agents have an incentive to price contracts fairly, and (c) that inaccuracies in estimates do not pose a significant risk to agents' profits. Finally, we present a fine-grained pricing mechanism for complex workflows and show that it can increase agent profits by more than an order of magnitude in some cases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2016:TBO, author = "Jia Yu and Mohamed Sarwat", title = "Two birds, one stone: a fast, yet lightweight, indexing scheme for modern database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "385--396", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025120", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Classic database indexes (e.g., B$^+$-Tree), though speed up queries, suffer from two main drawbacks: (1) An index usually yields 5\% to 15\% additional storage overhead which results in non-ignorable dollar cost in big data scenarios especially when deployed on modern storage devices. (2) Maintaining an index incurs high latency because the DBMS has to locate and update those index pages affected by the underlying table changes. This paper proposes Hippo, a fast, yet scalable, database indexing approach. 
It significantly shrinks the index storage and mitigates maintenance overhead without compromising much on the query execution performance. Hippo stores disk page ranges instead of tuple pointers in the indexed table to reduce the storage space occupied by the index. It maintains simplified histograms that represent the data distribution and adopts a page grouping technique that groups contiguous pages into page ranges based on the similarity of their index key attribute distributions. When a query is issued, Hippo leverages the page ranges and histogram-based page summaries to recognize those pages such that their tuples are guaranteed not to satisfy the query predicates and inspects the remaining pages. Experiments based on real and synthetic datasets show that Hippo occupies up to two orders of magnitude less storage space than that of the B$^+$-Tree while still achieving comparable query execution performance to that of the B$^+$-Tree for 0.1\%--1\% selectivity factors. Also, the experiments show that Hippo outperforms BRIN (Block Range Index) in executing queries with various selectivity factors. Furthermore, Hippo achieves up to three orders of magnitude less maintenance overhead and up to an order of magnitude higher throughput (for hybrid query/update workloads) than its counterparts.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2016:HMF, author = "Zheng Li and Tingjian Ge", title = "History is a mirror to the future: best-effort approximate complex event matching with insufficient resources", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "397--408", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025121", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Complex event processing (CEP) has proven to be a highly relevant topic in practice. As it is sensitive to both errors in the stream and uncertainty in the pattern, approximate complex event processing (ACEP) is an important direction but has not been adequately studied before. ACEP is costly, and is often performed under insufficient computing resources. We propose an algorithm that learns from the past behavior of ACEP runs, and makes decisions on what to process first in an online manner, so as to maximize the number of full matches found. In addition, we devise effective optimization techniques. Finally, we propose a mechanism that uses reinforcement learning to dynamically update the history structure without incurring much overhead. Put together, these techniques drastically improve the fraction of full matches found in resource constrained environments.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Avni:2016:PHT, author = "Hillel Avni and Trevor Brown", title = "Persistent hybrid transactional memory for databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "409--420", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025122", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Processors with hardware support for transactional memory (HTM) are rapidly becoming commonplace, and processor manufacturers are currently working on implementing support for upcoming non-volatile memory (NVM) technologies. The combination of HTM and NVM promises to be a natural choice for in-memory database synchronization. However, limitations on the size of hardware transactions and the lack of progress guarantees by modern HTM implementations prevent some applications from obtaining the full benefit of hardware transactional memory. In this paper, we propose a persistent hybrid TM algorithm called PHyTM for systems that support NVM and HTM. PHyTM allows hardware assisted ACID transactions to execute concurrently with pure software transactions, which allows applications to gain the benefit of persistent HTM while simultaneously accommodating unbounded transactions (with a high degree of concurrency). Experimental simulations demonstrate that PHyTM is fast and scalable for realistic workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2016:SOP, author = "Liwen Sun and Michael J. 
Franklin and Jiannan Wang and Eugene Wu", title = "Skipping-oriented partitioning for columnar layouts", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "421--432", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025123", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data volumes continue to grow, modern database systems increasingly rely on data skipping mechanisms to improve performance by avoiding access to irrelevant data. Recent work [39] proposed a fine-grained partitioning scheme that was shown to improve the opportunities for data skipping in row-oriented systems. Modern analytics and big data systems increasingly adopt columnar storage schemes, and in such systems, a row-based approach misses important opportunities for further improving data skipping. The flexibility of column-oriented organizations, however, comes with the additional cost of tuple reconstruction. In this paper, we develop Generalized Skipping-Oriented Partitioning (GSOP), a novel hybrid data skipping framework that takes into account these row-based and column-based tradeoffs. In contrast to previous column-oriented physical design work, GSOP considers the tradeoffs between horizontal data skipping and vertical partitioning jointly. Our experiments using two public benchmarks and a real-world workload show that GSOP can significantly reduce the amount of data scanned and improve end-to-end query response times over the state-of-the-art techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Singh:2016:EQU, author = "Sneha Aman Singh and Divesh Srivastava and Srikanta Tirthapura", title = "Estimating quantiles from the union of historical and streaming data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "433--444", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025124", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern enterprises generate huge amounts of streaming data, for example, micro-blog feeds, financial data, network monitoring and industrial application monitoring. While Data Stream Management Systems have proven successful in providing support for real-time alerting, many applications, such as network monitoring for intrusion detection and real-time bidding, require complex analytics over historical and real-time data over the data streams. We present a new method to process one of the most fundamental analytical primitives, quantile queries, on the union of historical and streaming data. Our method combines an index on historical data with a memory-efficient sketch on streaming data to answer quantile queries with accuracy-resource tradeoffs that are significantly better than current solutions that are based solely on disk-resident indexes or solely on streaming algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Serafini:2016:CFG, author = "Marco Serafini and Rebecca Taft and Aaron J. 
Elmore and Andrew Pavlo and Ashraf Aboulnaga and Michael Stonebraker", title = "{Clay}: fine-grained adaptive partitioning for general database schemas", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "445--456", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025125", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transaction processing database management systems (DBMSs) are critical for today's data-intensive applications because they enable an organization to quickly ingest and query new information. Many of these applications exceed the capabilities of a single server, and thus their database has to be deployed in a distributed DBMS. The key factor affecting such a system's performance is how the database is partitioned. If the database is partitioned incorrectly, the number of distributed transactions can be high. These transactions have to synchronize their operations over the network, which is considerably slower and leads to poor performance. Previous work on elastic database repartitioning has focused on a certain class of applications whose database schema can be represented in a hierarchical tree structure. But many applications cannot be partitioned in this manner, and thus are subject to distributed transactions that impede their performance and scalability. In this paper, we present a new on-line partitioning approach, called Clay, that supports both tree-based schemas and more complex ``general'' schemas with arbitrary foreign key relationships. Clay dynamically creates blocks of tuples to migrate among servers during repartitioning, placing no constraints on the schema but taking care to balance load and reduce the amount of data migrated. 
Clay achieves this goal by including in each block a set of hot tuples and other tuples co-accessed with these hot tuples. To evaluate our approach, we integrate Clay in a distributed, main-memory DBMS and show that it can generate partitioning schemes that enable the system to achieve up to 15$ \times $ better throughput and 99\% lower latency than existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Siddiqui:2016:EDE, author = "Tarique Siddiqui and Albert Kim and John Lee and Karrie Karahalios and Aditya Parameswaran", title = "Effortless data exploration with zenvisage: an expressive and interactive visual analytics system", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "457--468", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025126", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data visualization is by far the most commonly used mechanism to explore and extract insights from datasets, especially by novice data scientists. And yet, current visual analytics tools are rather limited in their ability to operate on collections of visualizations---by composing, filtering, comparing, and sorting them---to find those that depict desired trends or patterns. The process of visual data exploration remains a tedious process of trial-and-error. We propose zenvisage, a visual analytics platform for effortlessly finding desired visual patterns from large datasets. We introduce zenvisage's general purpose visual exploration language, ZQL (``zee-quel'') for specifying the desired visual patterns, drawing from use-cases in a variety of domains, including biology, mechanical engineering, climate science, and commerce. 
We formalize the expressiveness of ZQL via a visual exploration algebra---an algebra on collections of visualizations---and demonstrate that ZQL is as expressive as that algebra. zenvisage exposes an interactive front-end that supports the issuing of ZQL queries, and also supports interactions that are ``short-cuts'' to certain commonly used ZQL queries. To execute these queries, zenvisage uses a novel ZQL graph-based query optimizer that leverages a suite of optimizations tailored to the goal of processing collections of visualizations in certain pre-defined ways. Lastly, a user survey and study demonstrates that data scientists are able to effectively use zenvisage to eliminate error-prone and tedious exploration and directly identify desired visualizations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ceccarello:2017:MSA, author = "Matteo Ceccarello and Andrea Pietracaprina and Geppino Pucci and Eli Upfal", title = "{MapReduce} and streaming algorithms for diversity maximization in metric spaces of bounded doubling dimension", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "469--480", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a dataset of points in a metric space and an integer $k$, a diversity maximization problem requires determining a subset of $k$ points maximizing some diversity objective measure, e.g., the minimum or the average distance between two points in the subset. Diversity maximization is computationally hard, hence only approximate solutions can be hoped for. 
Although its applications are mainly in massive data analysis, most of the past research on diversity maximization focused on the sequential setting. In this work we present space and pass/round-efficient diversity maximization algorithms for the Streaming and MapReduce models and analyze their approximation guarantees for the relevant class of metric spaces of bounded doubling dimension. Like other approaches in the literature, our algorithms rely on the determination of high-quality core-sets, i.e., (much) smaller subsets of the input which contain good approximations to the optimal solution for the whole input. For a variety of diversity objective functions, our algorithms attain an $ (\alpha + \epsilon)$-approximation ratio, for any constant $ \epsilon > 0$, where $ \alpha $ is the best approximation ratio achieved by a polynomial-time, linear-space sequential algorithm for the same diversity objective. This improves substantially over the approximation ratios attainable in Streaming and MapReduce by state-of-the-art algorithms for general metric spaces. We provide extensive experimental evidence of the effectiveness of our algorithms on both real world and synthetic datasets, scaling up to over a billion points.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bindschaedler:2017:PDP, author = "Vincent Bindschaedler and Reza Shokri and Carl A. Gunter", title = "Plausible deniability for privacy-preserving data synthesis", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "481--492", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Releasing full data records is one of the most challenging problems in data privacy. 
On the one hand, many of the popular techniques such as data de-identification are problematic because of their dependence on the background knowledge of adversaries. On the other hand, rigorous methods such as the exponential mechanism for differential privacy are often computationally impractical to use for releasing high dimensional data or cannot preserve high utility of original data due to their extensive data perturbation. This paper presents a criterion called plausible deniability that provides a formal privacy guarantee, notably for releasing sensitive datasets: an output record can be released only if a certain amount of input records are indistinguishable, up to a privacy parameter. This notion does not depend on the background knowledge of an adversary. Also, it can efficiently be checked by privacy tests. We present mechanisms to generate synthetic datasets with similar statistical properties to the input data and the same format. We study this technique both theoretically and experimentally. A key theoretical result shows that, with proper randomization, the plausible deniability mechanism generates differentially private synthetic data. We demonstrate the efficiency of this generative technique on a large dataset; it is shown to preserve the utility of original data with respect to various statistical analysis and machine learning measures.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Verma:2017:ECP, author = "Shiv Verma and Luke M. 
Leslie and Yosub Shin and Indranil Gupta", title = "An experimental comparison of partitioning strategies in distributed graph processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "493--504", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we study the problem of choosing among partitioning strategies in distributed graph processing systems. To this end, we evaluate and characterize both the performance and resource usage of different partitioning strategies under various popular distributed graph processing systems, applications, input graphs, and execution environments. Through our experiments, we found that no single partitioning strategy is the best fit for all situations, and that the choice of partitioning strategy has a significant effect on resource usage and application run-time. Our experiments demonstrate that the choice of partitioning strategy depends on (1) the degree distribution of input graph, (2) the type and duration of the application, and (3) the cluster size. Based on our results, we present rules of thumb to help users pick the best partitioning strategy for their particular use cases. We present results from each system, as well as from all partitioning strategies implemented in one common system (PowerLyra).", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandramouli:2017:SPR, author = "Badrish Chandramouli and Jonathan Goldstein", title = "{Shrink}: prescribing resiliency solutions for streaming", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "505--516", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Streaming query deployments make up a vital part of cloud oriented applications. They vary widely in their data, logic, and statefulness, and are typically executed in multi-tenant distributed environments with varying uptime SLAs. In order to achieve these SLAs, one of a number of proposed resiliency strategies is employed to protect against failure. This paper has introduced the first, comprehensive, cloud friendly comparison between different resiliency techniques for streaming queries. In this paper, we introduce models which capture the costs associated with different resiliency strategies, and through a series of experiments which implement and validate these models, show that (1) there is no single resiliency strategy which efficiently handles most streaming scenarios; (2) the optimization space is too complex for a person to employ a ``rules of thumb'' approach; and (3) there exists a clear generalization of periodic checkpointing that is worth considering in many cases. Finally, the models presented in this paper can be adapted to fit a wide variety of resiliency strategies, and likely have important consequences for cloud services beyond those that are obviously streaming.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Barthels:2017:DJA, author = "Claude Barthels and Ingo M{\"u}ller and Timo Schneider and Gustavo Alonso and Torsten Hoefler", title = "Distributed join algorithms on thousands of cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "517--528", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional database operators such as joins are relevant not only in the context of database engines but also as a building block in many computational and machine learning algorithms. With the advent of big data, there is an increasing demand for efficient join algorithms that can scale with the input data size and the available hardware resources. In this paper, we explore the implementation of distributed join algorithms in systems with several thousand cores connected by a low-latency network as used in high performance computing systems or data centers. We compare radix hash join to sort-merge join algorithms and discuss their implementation at this scale. In the paper, we explain how to use MPI to implement joins, show the impact and advantages of RDMA, discuss the importance of network scheduling, and study the relative performance of sorting vs. hashing. The experimental results show that the algorithms we present scale well with the number of cores, reaching a throughput of 48.7 billion input tuples per second on 4,096 cores.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2017:CBS, author = "Junling Liu and Ke Deng and Huanliang Sun and Yu Ge and Xiaofang Zhou and Christian S. Jensen", title = "Clue-based spatio-textual query", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "529--540", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Along with the proliferation of online digital map and location-based service, very large POI (point of interest) databases have been constructed where a record corresponds to a POI with information including name, category, address, geographical location and other features. A basic spatial query in POI database is POI retrieval. In many scenarios, a user cannot provide enough information to pinpoint the POI except some clue. For example, a user wants to identify a caf{\'e} in a city visited many years ago. She cannot remember the name and address but she still recalls that ``the caf{\'e} is about 200 meters away from a restaurant; and turning left at the restaurant there is a bakery 500 meters away, etc.''. Intuitively, the clue, even partial and approximate, describes the spatio-textual context around the targeted POI. Motivated by this observation, this work investigates clue-based spatio-textual query which allows user providing clue, i.e., some nearby POIs and the spatial relationships between them, in POI retrieval. The objective is to retrieve $k$ POIs from a POI database with the highest spatio-textual context similarities against the clue. This work has deliberately designed data-quality-tolerant spatio-textual context similarity metric to cope with various data quality problems in both the clue and the POI database.
Through crossing valuation, the query accuracy is further enhanced by ensemble method. Also, this work has developed an index called roll-out-star R-tree (RSR-tree) to dramatically improve the query processing efficiency. The extensive tests on data sets from the real world have verified the superiority of our methods in all aspects.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2017:TIC, author = "Yudian Zheng and Guoliang Li and Yuanbing Li and Caihua Shan and Reynold Cheng", title = "Truth inference in crowdsourcing: is the problem solved?", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "541--552", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourcing has emerged as a novel problem-solving paradigm, which facilitates addressing problems that are hard for computers, e.g., entity resolution and sentiment analysis. However, due to the openness of crowdsourcing, workers may yield low-quality answers, and a redundancy-based method is widely employed, which first assigns each task to multiple workers and then infers the correct answer (called truth) for the task based on the answers of the assigned workers. A fundamental problem in this method is Truth Inference, which decides how to effectively infer the truth. Recently, the database community and data mining community independently study this problem and propose various algorithms. However, these algorithms are not compared extensively under the same framework and it is hard for practitioners to select appropriate algorithms. To alleviate this problem, we provide a detailed survey on 17 existing algorithms and perform a comprehensive evaluation using 5 real datasets. 
We make all codes and datasets public for future research. Through experiments we find that existing algorithms are not stable across different datasets and there is no algorithm that outperforms others consistently. We believe that the truth inference problem is not fully solved, and identify the limitations of existing algorithms and point out promising research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Harding:2017:EDC, author = "Rachael Harding and Dana {Van Aken} and Andrew Pavlo and Michael Stonebraker", title = "An evaluation of distributed concurrency control", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "553--564", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Increasing transaction volumes have led to a resurgence of interest in distributed transaction processing. In particular, partitioning data across several servers can improve throughput by allowing servers to process transactions in parallel. But executing transactions across servers limits the scalability and performance of these systems. In this paper, we quantify the effects of distribution on concurrency control protocols in a distributed environment. We evaluate six classic and modern protocols in an in-memory distributed database evaluation framework called Deneva, providing an apples-to-apples comparison between each. Our results expose severe limitations of distributed transaction processing engines. Moreover, in our analysis, we identify several protocol-specific scalability bottlenecks. 
We conclude that to achieve truly scalable operation, distributed concurrency control solutions must seek a tighter coupling with either novel network hardware (in the local area) or applications (via data modeling and semantically-aware execution), or both.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cui:2017:KLQ, author = "Wanyun Cui and Yanghua Xiao and Haixun Wang and Yangqiu Song and Seung-won Hwang and Wei Wang", title = "{KBQA}: learning question answering over {QA} corpora and knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "565--576", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Question answering (QA) has become a popular way for humans to access billion-scale knowledge bases. Unlike web search, QA over a knowledge base gives out accurate and concise results, provided that natural language questions can be understood and mapped precisely to structured queries over the knowledge base. The challenge, however, is that a human can ask one question in many different ways. Previous approaches have natural limits due to their representations: rule based approaches only understand a small set of ``canned'' questions, while keyword based or synonym based approaches cannot fully understand the questions. In this paper, we design a new kind of question representation: templates, over a billion scale knowledge base and a million scale QA corpora. For example, for questions about a city's population, we learn templates such as What's the population of city?, How many people are there in city?. We learned 27 million templates for 2782 intents. 
Based on these templates, our QA system KBQA effectively supports binary factoid questions, as well as complex questions which are composed of a series of binary factoid questions. Furthermore, we expand predicates in RDF knowledge base, which boosts the coverage of knowledge base by 57 times. Our QA system beats all other state-of-art works on both effectiveness and efficiency over QALD benchmarks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deutch:2017:PNL, author = "Daniel Deutch and Nave Frost and Amir Gilad", title = "Provenance for natural language queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "577--588", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multiple lines of research have developed Natural Language (NL) interfaces for formulating database queries. We build upon this work, but focus on presenting a highly detailed form of the answers in NL. The answers that we present are importantly based on the provenance of tuples in the query result, detailing not only the results but also their explanations. We develop a novel method for transforming provenance information to NL, by leveraging the original NL query structure. Furthermore, since provenance information is typically large and complex, we present two solutions for its effective presentation as NL text: one that is based on provenance factorization, with novel desiderata relevant to the NL case, and one that is based on summarization. We have implemented our solution in an end-to-end system supporting questions, answers and provenance, all expressed in NL. 
Our experiments, including a user study, indicate the quality of our solution and its scalability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2017:AAP, author = "Yi Lu and Anil Shanbhag and Alekh Jindal and Samuel Madden", title = "{AdaptDB}: adaptive partitioning for distributed joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "589--600", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data analytics often involves complex join queries over two or more tables. Such join processing is expensive in a distributed setting both because large amounts of data must be read from disk, and because of data shuffling across the network. Many techniques based on data partitioning have been proposed to reduce the amount of data that must be accessed, often focusing on finding the best partitioning scheme for a particular workload, rather than adapting to changes in the workload over time. In this paper, we present AdaptDB, an adaptive storage manager for analytical database workloads in a distributed setting. It works by partitioning datasets across a cluster and incrementally refining data partitioning as queries are run. AdaptDB introduces a novel hyper-join that avoids expensive data shuffling by identifying storage blocks of the joining tables that overlap on the join attribute, and only joining those blocks. Hyper-join performs well when each block in one table overlaps with few blocks in the other table, since that will minimize the number of blocks that have to be accessed. 
To minimize the number of overlapping blocks for common join queries, AdaptDB uses smooth repartitioning to repartition small portions of the tables on join attributes as queries run. A prototype of AdaptDB running on top of Spark improves query performance by 2--3x on TPC-H as well as real-world dataset, versus a system that employs scans and shuffle-joins.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:EES, author = "Zhipeng Zhang and Yingxia Shao and Bin Cui and Ce Zhang", title = "An experimental evaluation of {SimRank}-based similarity search algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "601--612", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a graph, SimRank is one of the most popular measures of the similarity between two vertices. We focus on efficiently calculating SimRank, which has been studied intensively over the last decade. This has led to many algorithms that efficiently calculate or approximate SimRank being proposed by researchers. Despite these abundant research efforts, there is no systematic comparison of these algorithms. In this paper, we conduct a study to compare these algorithms to understand their pros and cons. We first introduce a taxonomy for different algorithms that calculate SimRank and classify each algorithm into one of the following three classes, namely, iterative-, non-iterative-, and random walk-based method. We implement ten algorithms published from 2002 to 2015, and compare them using synthetic and real-world graphs.
To ensure the fairness of our study, our implementations use the same data structure and execution framework, and we try our best to optimize each of these algorithms. Our study reveals that none of these algorithms dominates the others: algorithms based on iterative method often have higher accuracy while algorithms based on random walk can be more scalable. One noniterative algorithm has good effectiveness and efficiency on graphs with medium size. Thus, depending on the requirements of different applications, the optimal choice of algorithms differs. This paper provides an empirical guideline for making such choices.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Faleiro:2017:HPT, author = "Jose M. Faleiro and Daniel J. Abadi and Joseph M. Hellerstein", title = "High performance transactions via early write visibility", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "613--624", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In order to guarantee recoverable transaction execution, database systems permit a transaction's writes to be observable only at the end of its execution. As a consequence, there is generally a delay between the time a transaction performs a write and the time later transactions are permitted to read it. This delayed write visibility can significantly impact the performance of serializable database systems by reducing concurrency among conflicting transactions. This paper makes the observation that delayed write visibility stems from the fact that database systems can arbitrarily abort transactions at any point during their execution. 
Accordingly, we make the case for database systems which only abort transactions under a restricted set of conditions, thereby enabling a new recoverability mechanism, early write visibility, which safely makes transactions' writes visible prior to the end of their execution. We design a new serializable concurrency control protocol, piece-wise visibility (PWV), with the explicit goal of enabling early write visibility. We evaluate PWV against state-of-the-art serializable protocols and a highly optimized implementation of read committed, and find that PWV can outperform serializable protocols by an order of magnitude and read committed by 3X on high contention workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eswaran:2017:ZBP, author = "Dhivya Eswaran and Stephan G{\"u}nnemann and Christos Faloutsos and Disha Makhija and Mohit Kumar", title = "{ZooBP}: belief propagation for heterogeneous networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "5", pages = "625--636", month = jan, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a heterogeneous network, with nodes of different types --- e.g., products, users and sellers from an online recommendation site like Amazon --- and labels for a few nodes (`honest', `suspicious', etc.), can we find a closed formula for Belief Propagation (BP), exact or approximate? Can we say whether it will converge?
BP, traditionally an inference algorithm for graphical models, exploits so-called ``network effects'' to perform graph classification tasks when labels for a subset of nodes are provided; and it has been successful in numerous settings like fraudulent entity detection in online retailers and classification in social networks. However, it does not have a closed-form nor does it provide convergence guarantees in general. We propose ZooBP, a method to perform fast BP on undirected heterogeneous graphs with provable convergence guarantees. ZooBP has the following advantages: (1) Generality: It works on heterogeneous graphs with multiple types of nodes and edges; (2) Closed-form solution: ZooBP gives a closed-form solution as well as convergence guarantees; (3) Scalability: ZooBP is linear on the graph size and is up to 600$ \times $ faster than BP, running on graphs with 3.3 million edges in a few seconds; (4) Effectiveness: Applied on real data (a Flipkart e-commerce network with users, products and sellers), ZooBP identifies fraudulent users with a near-perfect precision of 92.3\% over the top 300 results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lyu:2017:USV, author = "Min Lyu and Dong Su and Ninghui Li", title = "Understanding the sparse vector technique for differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "637--648", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Sparse Vector Technique (SVT) is a fundamental technique for satisfying differential privacy and has the unique quality that one can output some query answers without apparently paying any privacy cost.
SVT has been used in both the interactive setting, where one tries to answer a sequence of queries that are not known ahead of the time, and in the non-interactive setting, where all queries are known. Because of the potential savings on privacy budget, many variants for SVT have been proposed and employed in privacy-preserving data mining and publishing. However, most variants of SVT are actually not private. In this paper, we analyze these errors and identify the misunderstandings that likely contribute to them. We also propose a new version of SVT that provides better utility, and introduce an effective technique to improve the performance of SVT. These enhancements can be applied to improve utility in the interactive setting. Through both analytical and experimental comparisons, we show that, in the non-interactive setting (but not the interactive setting), the SVT technique is unnecessary, as it can be replaced by the Exponential Mechanism (EM) with better accuracy.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:OEA, author = "Fan Zhang and Wenjie Zhang and Ying Zhang and Lu Qin and Xuemin Lin", title = "{OLAK}: an efficient algorithm to prevent unraveling in social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "649--660", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we study the problem of the anchored $k$-core. Given a graph $G$, an integer $k$ and a budget $b$, we aim to identify $b$ vertices in $G$ so that we can determine the largest induced subgraph $J$ in which every vertex, except the $b$ vertices, has at least $k$ neighbors in $J$. 
This problem was introduced by Bhawalkar and Kleinberg et al. in the context of user engagement in social networks, where a user may leave a community if he/she has less than $k$ friends engaged. The problem has been shown to be NP-hard and inapproximable. A polynomial-time algorithm for graphs with bounded tree-width has been proposed. However, this assumption usually does not hold in real-life graphs, and their techniques cannot be extended to handle general graphs. Motivated by this, we propose an efficient algorithm, namely onion-layer based anchored $k$-core (OLAK), for the anchored $k$-core problem on large scale graphs. To facilitate computation of the anchored $k$-core, we design an onion layer structure, which is generated by a simple onion-peeling-like algorithm against a small set of vertices in the graph. We show that computation of the best anchor can simply be conducted upon the vertices on the onion layers, which significantly reduces the search space. Based on the well-organized layer structure, we develop efficient candidates exploration, early termination and pruning techniques to further speed up computation. Comprehensive experiments on 10 real-life graphs demonstrate the effectiveness and efficiency of our proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2017:DTI, author = "Meraj Khan and Larry Xu and Arnab Nandi and Joseph M.
Hellerstein", title = "Data tweening: incremental visualization of data transforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "661--672", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the context of interactive query sessions, it is common to issue a succession of queries, transforming a dataset to the desired result. It is often difficult to comprehend a succession of transformations, especially for complex queries. Thus, to facilitate understanding of each data transformation and to provide continuous feedback, we introduce the concept of ``data tweening'', i.e., interpolating between resultsets, presenting to the user a series of incremental visual representations of a resultset transformation. We present tweening methods that consider not just the changes in the result, but also the changes in the query. Through user studies, we show that data tweening allows users to efficiently comprehend data transforms, and also enables them to gain a better understanding of the underlying query operations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bater:2017:SSQ, author = "Johes Bater and Gregory Elliott and Craig Eggen and Satyender Goel and Abel Kho and Jennie Rogers", title = "{SMCQL}: secure querying for federated databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "673--684", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "People and machines are collecting data at an unprecedented rate. 
Despite this newfound abundance of data, progress has been slow in sharing it for open science, business, and other data-intensive endeavors. Many such efforts are stymied by privacy concerns and regulatory compliance issues. For example, many hospitals are interested in pooling their medical records for research, but none may disclose arbitrary patient records to researchers or other healthcare providers. In this context we propose the Private Data Network (PDN), a federated database for querying over the collective data of mutually distrustful parties. In a PDN, each member database does not reveal its tuples to its peers nor to the query writer. Instead, the user submits a query to an honest broker that plans and coordinates its execution over multiple private databases using secure multiparty computation (SMC). Here, each database's query execution is oblivious, and its program counters and memory traces are agnostic to the inputs of others. We introduce a framework for executing PDN queries named smcql. This system translates SQL statements into SMC primitives to compute query results over the union of its source databases without revealing sensitive information about individual tuples to peer data providers or the honest broker. Only the honest broker and the querier receive the results of a PDN query. For fast, secure query evaluation, we explore a heuristics-driven optimizer that minimizes the PDN's use of secure computation and partitions its query evaluation into scalable slices.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zamanian:2017:EMD, author = "Erfan Zamanian and Carsten Binnig and Tim Harris and Tim Kraska", title = "The end of a myth: distributed transactions can scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "685--696", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The common wisdom is that distributed transactions do not scale. But what if distributed transactions could be made scalable using the next generation of networks and a redesign of distributed databases? There would no longer be a need for developers to worry about co-partitioning schemes to achieve decent performance. Application development would become easier as data placement would no longer determine how scalable an application is. Hardware provisioning would be simplified as the system administrator can expect a linear scale-out when adding more machines rather than some complex sub-linear function, which is highly application specific. In this paper, we present the design of our novel scalable database system NAM-DB and show that distributed transactions with the very common Snapshot Isolation guarantee can indeed scale using the next generation of RDMA-enabled network technology without any inherent bottlenecks. Our experiments with the TPC-C benchmark show that our system scales linearly to over 6.5 million new-order (14.5 million total) distributed transactions per second on 56 machines.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2017:NIG, author = "Haohan Zhu and Xianrui Meng and George Kollios", title = "{NED}: an inter-graph node metric based on edit distance", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "697--708", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Node similarity is fundamental in graph analytics. However, node similarity between nodes in different graphs (inter-graph nodes) has not received enough attention yet. The inter-graph node similarity is important in learning a new graph based on the knowledge extracted from an existing graph (transfer learning on graphs) and has applications in biological, communication, and social networks. In this paper, we propose a novel distance function for measuring inter-graph node similarity with edit distance, called NED. In NED, two nodes are compared according to their local neighborhood topologies which are represented as unordered $k$-adjacent trees, without relying on any extra information. Due to the hardness of computing tree edit distance on unordered trees which is NP-Complete, we propose a modified tree edit distance, called TED*, for comparing unordered and unlabeled $k$-adjacent trees. TED* is a metric distance, as the original tree edit distance, but more importantly, TED* is polynomially computable. As a metric distance, NED admits efficient indexing, provides interpretable results, and shows to perform better than existing approaches on a number of data analysis tasks, including graph deanonymization. Finally, the efficiency and effectiveness of NED are empirically demonstrated using real-world graphs.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2017:ECS, author = "Yixiang Fang and Reynold Cheng and Xiaodong Li and Siqiang Luo and Jiafeng Hu", title = "Effective community search over large spatial graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "6", pages = "709--720", month = feb, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Communities are prevalent in social networks, knowledge graphs, and biological networks. Recently, the topic of community search (CS) has received plenty of attention. Given a query vertex, CS looks for a dense subgraph that contains it. Existing CS solutions do not consider the spatial extent of a community. They can yield communities whose locations of vertices span large areas. In applications that facilitate the creation of social events (e.g., finding conference attendees to join a dinner), it is important to find groups of people who are physically close to each other. In this situation, it is desirable to have a spatial-aware community (or SAC), whose vertices are close structurally and spatially. Given a graph G and a query vertex q, we develop exact solutions for finding an SAC that contains q. Since these solutions cannot scale to large datasets, we have further designed three approximation algorithms to compute an SAC. We have performed an experimental evaluation for these solutions on both large real and synthetic datasets. Experimental results show that SAC is better than the communities returned by existing solutions. Moreover, our approximation solutions can find SACs accurately and efficiently.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Szlichta:2017:ECD, author = "Jaros{\l}aw Szlichta and Parke Godfrey and Lukasz Golab and Mehdi Kargar and Divesh Srivastava", title = "Effective and complete discovery of order dependencies via set-based axiomatization", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "721--732", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Integrity constraints (ICs) are useful for query optimization and for expressing and enforcing application semantics. However, formulating constraints manually requires domain expertise, is prone to human errors, and may be excessively time consuming, especially on large datasets. Hence, proposals for automatic discovery have been made for some classes of ICs, such as functional dependencies (FDs), and recently, order dependencies (ODs). ODs properly subsume FDs, as they can additionally express business rules involving order; e.g., an employee never has a higher salary while paying lower taxes than another employee. We present a new OD discovery algorithm enabled by a novel polynomial mapping to a canonical form of ODs, and a sound and complete set of axioms (inference rules) for canonical ODs. Our algorithm has exponential worst-case time complexity, $O(2^{|R|})$, in the number of attributes $|R|$ and linear complexity in the number of tuples. We prove that it produces a complete and minimal set of ODs. Using real and synthetic datasets, we experimentally show orders-of-magnitude performance improvements over the prior state-of-the-art.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karnagel:2017:AWP, author = "Tomas Karnagel and Dirk Habich and Wolfgang Lehner", title = "Adaptive work placement for query processing on heterogeneous computing resources", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "733--744", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The hardware landscape is currently changing from homogeneous multi-core systems towards heterogeneous systems with many different computing units, each with their own characteristics. This trend is a great opportunity for database systems to increase the overall performance if the heterogeneous resources can be utilized efficiently. To achieve this, the main challenge is to place the right work on the right computing unit. Current approaches tackling this placement for query processing assume that data cardinalities of intermediate results can be correctly estimated. However, this assumption does not hold for complex queries. To overcome this problem, we propose an adaptive placement approach being independent of cardinality estimation of intermediate results. Our approach is incorporated in a novel adaptive placement sequence. Additionally, we implement our approach as an extensible virtualization layer, to demonstrate the broad applicability with multiple database systems. In our evaluation, we clearly show that our approach significantly improves OLAP query processing on heterogeneous hardware, while being adaptive enough to react to changing cardinalities of intermediate query results.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2017:LFE, author = "Fan Yang and Fanhua Shang and Yuzhen Huang and James Cheng and Jinfeng Li and Yunjian Zhao and Ruihao Zhao", title = "{LFTF}: a framework for efficient tensor analytics at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "745--756", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tensors are higher order generalizations of matrices to model multi-aspect data, e.g., a set of purchase records with the schema (user\_id, product\_id, timestamp, feedback). Tensor factorization is a powerful technique for generating a model from a tensor, just like matrix factorization generates a model from a matrix, but with higher accuracy and richer information as more attributes are available in a higher-order tensor than a matrix. The data model obtained by tensor factorization can be used for classification, recommendation, anomaly detection, and so on. Though having a broad range of applications, tensor factorization has not been popularly applied compared with matrix factorization that has been widely used in recommender systems, mainly due to the high computational cost and poor scalability of existing tensor factorization methods. Efficient and scalable tensor factorization is particularly challenging because real world tensor data are mostly sparse and massive. In this paper, we propose a novel distributed algorithm, called Lock-Free Tensor Factorization (LFTF), which significantly improves the efficiency and scalability of distributed tensor factorization by exploiting asynchronous execution in a re-formulated problem.
Our experiments show that LFTF achieves much higher CPU and network throughput than existing methods, converges at least 17 times faster and scales to much larger datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gupta:2017:LSM, author = "Shalmoli Gupta and Ravi Kumar and Kefu Lu and Benjamin Moseley and Sergei Vassilvitskii", title = "Local search methods for $k$-means with outliers", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "757--768", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of $k$-means clustering in the presence of outliers. The goal is to cluster a set of data points to minimize the variance of the points assigned to the same cluster, with the freedom of ignoring a small set of data points that can be labeled as outliers. Clustering with outliers has received a lot of attention in the data processing community, but practical, efficient, and provably good algorithms remain unknown for the most popular $k$-means objective. Our work proposes a simple local search-based algorithm for $k$-means clustering with outliers. We prove that this algorithm achieves constant-factor approximate solutions and can be combined with known sketching techniques to scale to large data sets. Using empirical evaluation on both synthetic and large-scale real-world data, we demonstrate that the algorithm dominates recently proposed heuristic approaches for the problem.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Casanova:2017:DTR, author = "Guillaume Casanova and Elias Englmeier and Michael E. Houle and Peer Kr{\"o}ger and Michael Nett and Erich Schubert and Arthur Zimek", title = "Dimensional testing for reverse $k$-nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "769--780", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a query object $q$, reverse $k$-nearest neighbor (R$k$NN) search aims to locate those objects of the database that have $q$ among their $k$-nearest neighbors. In this paper, we propose an approximation method for solving R$k$NN queries, where the pruning operations and termination tests are guided by a characterization of the intrinsic dimensionality of the data. The method can accommodate any index structure supporting incremental (forward) nearest-neighbor search for the generation and verification of candidates, while avoiding impractically-high preprocessing costs. We also provide experimental evidence that our method significantly outperforms its competitors in terms of the tradeoff between execution time and the quality of the approximation. Our approach thus addresses many of the scalability issues surrounding the use of previous methods in data mining.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2017:EEM, author = "Yingjun Wu and Joy Arulraj and Jiexi Lin and Ran Xian and Andrew Pavlo", title = "An empirical evaluation of in-memory multi-version concurrency control", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "781--792", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-version concurrency control (MVCC) is currently the most popular transaction management scheme in modern database management systems (DBMSs). Although MVCC was discovered in the late 1970s, it is used in almost every major relational DBMS released in the last decade. Maintaining multiple versions of data potentially increases parallelism without sacrificing serializability when processing transactions. But scaling MVCC in a multi-core and in-memory setting is non-trivial: when there are a large number of threads running in parallel, the synchronization overhead can outweigh the benefits of multi-versioning. To understand how MVCC perform when processing transactions in modern hardware settings, we conduct an extensive study of the scheme's four key design decisions: concurrency control protocol, version storage, garbage collection, and index management. We implemented state-of-the-art variants of all of these in an in-memory DBMS and evaluated them using OLTP workloads. Our analysis identifies the fundamental bottlenecks of each design choice.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2017:FDH, author = "You Wu and Junyang Gao and Pankaj K. 
Agarwal and Jun Yang", title = "Finding diverse, high-value representatives on a surface of answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "793--804", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many applications, the system needs to selectively present a small subset of answers to users. The set of all possible answers can be seen as an elevation surface over a domain, where the elevation measures the quality of each answer, and the dimensions of the domain correspond to attributes of the answers with which similarity between answers can be measured. This paper considers the problem of finding a diverse set of $k$ high-quality representatives for such a surface. We show that existing methods for diversified top-$k$ and weighted clustering problems are inadequate for this problem. We propose $k$-DHR as a better formulation for the problem. We show that $k$-DHR has a submodular and monotone objective function, and we develop efficient algorithms for solving $k$-DHR with provable guarantees. We conduct extensive experiments to demonstrate the usefulness of the results produced by $k$-DHR for applications in computational lead-finding and fact-checking, as well as the efficiency and effectiveness of our algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:RTI, author = "Yanhao Wang and Qi Fan and Yuchen Li and Kian-Lee Tan", title = "Real-time influence maximization on dynamic social streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "805--816", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Influence maximization (IM), which selects a set of $k$ users (called seeds) to maximize the influence spread over a social network, is a fundamental problem in a wide range of applications such as viral marketing and network monitoring. Existing IM solutions fail to consider the highly dynamic nature of social influence, which results in either poor seed qualities or long processing time when the network evolves. To address this problem, we define a novel IM query named Stream Influence Maximization (SIM) on social streams. Technically, SIM adopts the sliding window model and maintains a set of $k$ seeds with the largest influence value over the most recent social actions. Next, we propose the Influential Checkpoints (IC) framework to facilitate continuous SIM query processing. The IC framework creates a checkpoint for each window shift and ensures an $ \epsilon $-approximate solution. To improve its efficiency, we further devise a Sparse Influential Checkpoints (SIC) framework which selectively keeps $ O(\log N / \beta)$ checkpoints for a sliding window of size $N$ and maintains an $ \epsilon (1 - \beta) / 2$-approximate solution. Experimental results on both real-world and synthetic datasets confirm the effectiveness and efficiency of our proposed frameworks against the state-of-the-art IM approaches.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cai:2017:CDC, author = "Hongyun Cai and Vincent W. Zheng and Fanwei Zhu and Kevin Chen-Chuan Chang and Zi Huang", title = "From community detection to community profiling", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "817--828", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most existing community-related studies focus on detection, which aim to find the community membership for each user from user friendship links. However, membership alone, without a complete profile of what a community is and how it interacts with other communities, has limited applications. This motivates us to consider systematically profiling the communities and thereby developing useful community-level applications. In this paper, we for the first time formalize the concept of community profiling. With rich user information on the network, such as user published content and user diffusion links, we characterize a community in terms of both its internal content profile and external diffusion profile. The difficulty of community profiling is often underestimated. We novelly identify three unique challenges and propose a joint Community Profiling and Detection (CPD) model to address them accordingly. We also contribute a scalable inference algorithm, which scales linearly with the data size and it is easily parallelizable. We evaluate CPD on large-scale real-world data sets, and show that it is significantly better than the state-of-the-art baselines in various tasks.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jain:2017:UWD, author = "Ayush Jain and Akash Das Sarma and Aditya Parameswaran and Jennifer Widom", title = "Understanding workers, developing effective tasks, and enhancing marketplace dynamics: a study of a large crowdsourcing marketplace", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "829--840", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We conduct an experimental analysis of a dataset comprising over 27 million microtasks performed by over 70,000 workers issued to a large crowdsourcing marketplace between 2012--2016. Using this data---never before analyzed in an academic context---we shed light on three crucial aspects of crowdsourcing: (1) Task design---helping requesters understand what constitutes an effective task, and how to go about designing one; (2) Marketplace dynamics --- helping marketplace administrators and designers understand the interaction between tasks and workers, and the corresponding marketplace load; and (3) Worker behavior --- understanding worker attention spans, lifetimes, and general behavior, for the improvement of the crowdsourcing ecosystem as a whole.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2017:OPE, author = "Xuelian Lin and Shuai Ma and Han Zhang and Tianyu Wo and Jinpeng Huai", title = "One-pass error bounded trajectory simplification", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "7", pages = "841--852", month = mar, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Mar 27 20:45:15 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays, various sensors are collecting, storing and transmitting tremendous trajectory data, and it is known that raw trajectory data seriously wastes the storage, network band and computing resource. Line simplification (LS) algorithms are an effective approach to attacking this issue by compressing data points in a trajectory to a set of continuous line segments, and are commonly used in practice. However, existing LS algorithms are not sufficient for the needs of sensors in mobile devices. In this study, we first develop a one-pass error bounded trajectory simplification algorithm (OPERB), which scans each data point in a trajectory once and only once. We then propose an aggressive one-pass error bounded trajectory simplification algorithm (OPERB-A), which allows interpolating new data points into a trajectory under certain conditions. Finally, we experimentally verify that our approaches (OPERB and OPERB-A) are both efficient and effective, using four real-life trajectory datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:MIL, author = "Jianguo Wang and Chunbin Lin and Ruining He and Moojin Chae and Yannis Papakonstantinou and Steven Swanson", title = "{MILC}: inverted list compression in memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "8", pages = "853--864", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3090163.3090164", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Inverted list compression is a topic that has been studied for 50 years due to its fundamental importance in numerous applications including information retrieval, databases, and graph analytics. Typically, an inverted list compression algorithm is evaluated on its space overhead and query processing time. Earlier list compression designs mainly focused on minimizing the space overhead to reduce expensive disk I/O time in disk-oriented systems. But the recent trend is shifted towards reducing query processing time because the underlying systems tend to be memory-resident. Although there are many highly optimized compression approaches in main memory, there is still a considerable performance gap between query processing over compressed lists and uncompressed lists, which motivates this work. In this work, we set out to bridge this performance gap for the first time by proposing a new compression scheme, namely, MILC (memory inverted list compression). MILC relies on a series of techniques including offset-oriented fixed-bit encoding, dynamic partitioning, in-block compression, cache-aware optimization, and SIMD acceleration. 
We conduct experiments on three real-world datasets in information retrieval, databases, and graph analytics to demonstrate the high performance and low space overhead of MILC. We compare MILC with 12 recent compression algorithms and experimentally show that MILC improves the query performance by up to 13.2$ \times $ and reduces the space overhead by up to 4.7$ \times $.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2017:CDD, author = "Botong Huang and Jun Yang", title = "{C{\"u}m{\"u}l{\"o}n-D}: data analytics in a dynamic spot market", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "8", pages = "865--876", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3090163.3090165", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a system called C{\"u}m{\"u}l{\"o}n-D for matrix-based data analysis in a spot market of a public cloud. Prices in such markets fluctuate over time: while users can acquire machines usually at a very low bid price, the cloud can terminate these machines as soon as the market price exceeds their bid price. The distinguishing features of C{\"u}m{\"u}l{\"o}n-D include its continuous, proactive adaptation to the changing market, and its ability to quantify and control the monetary risk involved in paying for a workflow execution. We solve the dynamic optimization problem in a principled manner with a Markov decision process, and account for practical details that are often ignored previously but nonetheless important to performance. We evaluate C{\"u}m{\"u}l{\"o}n-D's effectiveness and advantages over previous approaches with experiments on Amazon EC2.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Then:2017:AAT, author = "Manuel Then and Timo Kersten and Stephan G{\"u}nnemann and Alfons Kemper and Thomas Neumann", title = "Automatic algorithm transformation for efficient multi-snapshot analytics on temporal graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "8", pages = "877--888", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3090163.3090166", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analytical graph algorithms commonly compute metrics for a graph at one point in time. In practice it is often also of interest how metrics change over time, e.g., to find trends. For this purpose, algorithms must be executed for multiple graph snapshots. We present Single Algorithm Multiple Snapshots (SAMS), a novel approach to execute algorithms concurrently for multiple graph snapshots. SAMS automatically transforms graph algorithms to leverage similarities between the analyzed graph snapshots. The automatic transformation interleaves algorithm executions on multiple snapshots, synergistically shares their graph accesses and traversals, and optimizes the algorithm's data layout. Thus, SAMS can amortize the cost of random data accesses and improve memory bandwidth utilization---two main cost factors in graph analytics. We extensively evaluate SAMS using six well-known algorithms and multiple synthetic as well as real-world graph datasets. Our measurements show that in multi-snapshot analyses, SAMS offers runtime improvements of up to two orders of magnitude over traditional snapshot-at-a-time execution.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2017:LAM, author = "Jianqiao Zhu and Navneet Potti and Saket Saurabh and Jignesh M. Patel", title = "Looking ahead makes query plans robust: making the initial case with in-memory star schema data warehouse workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "8", pages = "889--900", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3090163.3090167", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers and query execution engines cooperate to deliver high performance on complex analytic queries. Typically, the optimizer searches through the plan space and sends a selected plan to the execution engine. However, optimizers may at times miss the optimal plan, with sometimes disastrous impact on performance. In this paper, we develop the notion of robustness of a query evaluation strategy with respect to a space of query plans. We also propose a novel query execution strategy called Lookahead Information Passing (LIP) that is robust with respect to the space of (fully pipeline-able) left-deep query plan trees for in-memory star schema data warehouses. LIP ensures that execution times for the best and the worst case plans are far closer than without LIP. In fact, under certain assumptions of independent and uniform distributions, any plan in that space is theoretically guaranteed to execute in near-optimal time. LIP ensures that the execution time for every plan in the space is nearly-optimal. In this paper, we also evaluate these claims using workloads that include skew and correlation. 
With LIP we make an initial foray into a novel way of thinking about robustness from the perspective of query evaluation, where we develop strategies (like LIP) that collapse plan sub-spaces in the overall global plan space.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Anderson:2017:BGB, author = "Michael Anderson and Shaden Smith and Narayanan Sundaram and Mihai Capota and Zheguang Zhao and Subramanya Dulloor and Nadathur Satish and Theodore L. Willke", title = "Bridging the gap between {HPC} and big data frameworks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "8", pages = "901--912", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3090163.3090168", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Apache Spark is a popular framework for data analytics with attractive features such as fault tolerance and interoperability with the Hadoop ecosystem. Unfortunately, many analytics operations in Spark are an order of magnitude or more slower compared to native implementations written with high performance computing tools such as MPI. There is a need to bridge the performance gap while retaining the benefits of the Spark ecosystem such as availability, productivity, and fault tolerance. In this paper, we propose a system for integrating MPI with Spark and analyze the costs and benefits of doing so for four distributed graph and machine learning applications. We show that offloading computation to an MPI environment from within Spark provides 3.1--17.7$ \times $ speedups on the four sparse applications, including all of the overheads. 
This opens up an avenue to reuse existing MPI libraries in Spark with little effort.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2017:RSS, author = "Keke Huang and Sibo Wang and Glenn Bevilacqua and Xiaokui Xiao and Laks V. S. Lakshmanan", title = "Revisiting the stop-and-stare algorithms for influence maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "913--924", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Influence maximization is a combinatorial optimization problem that finds important applications in viral marketing, feed recommendation, etc. Recent research has led to a number of scalable approximation algorithms for influence maximization, such as TIM$^+$ and IMM, and more recently, SSA and D-SSA. The goal of this paper is to conduct a rigorous theoretical and experimental analysis of SSA and D-SSA and compare them against the preceding algorithms. In doing so, we uncover inaccuracies in previously reported technical results on the accuracy and efficiency of SSA and D-SSA, which we set right. We also attempt to reproduce the original experiments on SSA and D-SSA, based on which we provide interesting empirical insights. Our evaluation confirms some results reported from the original experiments, but it also reveals anomalies in some other results and sheds light on the behavior of SSA and D-SSA in some important settings not considered previously. We also report on the performance of SSA-Fix, our modification to SSA in order to restore the approximation guarantee that was claimed for but not enjoyed by SSA. 
Overall, our study suggests that there exist opportunities for further scaling up influence maximization with approximation guarantees.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:LSR, author = "Xubo Wang and Lu Qin and Xuemin Lin and Ying Zhang and Lijun Chang", title = "Leveraging set relations in exact set similarity join", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "925--936", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Exact set similarity join, which finds all the similar set pairs from two collections of sets, is a fundamental problem with a wide range of applications. The existing solutions for set similarity join follow a filtering-verification framework, which generates a list of candidate pairs through scanning indexes in the filtering phase, and reports those similar pairs in the verification phase. Though much research has been conducted on this problem, set relations, which we find out is quite effective on improving the algorithm efficiency through computational cost sharing, have never been studied. Therefore, in this paper, instead of considering each set individually, we explore the set relations in different levels to reduce the overall computational costs. First, it has been shown that most of the computational time is spent on the filtering phase, which can be quadratic to the number of sets in the worst case for the existing solutions. Thus we explore index-level set relations to reduce the filtering cost to be linear to the size of the input while keeping the same filtering power. We achieve this by grouping related sets into blocks in the index and skipping useless index probes in joins. 
Second, we explore answer-level set relations to further improve the algorithm based on the intuition that if two sets are similar, their answers may have a large overlap. We derive an algorithm which incrementally generates the answer of one set from an already computed answer of another similar set rather than compute the answer from scratch to reduce the computational cost. Finally, we conduct extensive performance studies using 21 real datasets with various data properties from a wide range of domains. The experimental results demonstrate that our algorithm outperforms all the existing algorithms across all datasets and can achieve more than an order of magnitude speedup against the state-of-the-art algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jiang:2017:RRW, author = "Minhao Jiang and Ada Wai-Chee Fu and Raymond Chi-Wing Wong", title = "{READS}: a random walk approach for efficient and accurate dynamic {SimRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "937--948", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity among entities in graphs plays a key role in data analysis and mining. SimRank is a widely used and popular measurement to evaluate the similarity among the vertices. In real-life applications, graphs do not only grow in size, requiring fast and precise SimRank computation for large graphs, but also change and evolve continuously over time, demanding an efficient maintenance process to handle dynamic updates. In this paper, we propose a random walk based indexing scheme to compute SimRank efficiently and accurately over large dynamic graphs. 
We show that our algorithm outperforms the state-of-the-art static and dynamic SimRank algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2017:ADC, author = "Xin Huang and Laks V. S. Lakshmanan", title = "Attribute-driven community search", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "949--960", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, community search over graphs has gained significant interest. In applications such as analysis of protein-protein interaction (PPI) networks, citation graphs, and collaboration networks, nodes tend to have attributes. Unfortunately, most previous community search algorithms ignore attributes and result in communities with poor cohesion w.r.t. their node attributes. In this paper, we study the problem of attribute-driven community search, that is, given an undirected graph G where nodes are associated with attributes, and an input query Q consisting of nodes V$_q$ and attributes W$_q$, find the communities containing V$_q$, in which most community members are densely inter-connected and have similar attributes. We formulate this problem as finding attributed truss communities (ATC), i.e., finding connected and close k-truss subgraphs containing V$_q$, with the largest attribute relevance score. We design a framework of desirable properties that good score function should satisfy. We show that the problem is NP-hard. However, we develop an efficient greedy algorithmic framework to iteratively remove nodes with the least popular attributes, and shrink the graph into an ATC. 
In addition, we also build an elegant index to maintain k-truss structure and attribute information, and propose efficient query processing algorithms. Extensive experiments on large real-world networks with ground-truth communities show that our algorithms significantly outperform the state of the art and demonstrates their efficiency and effectiveness.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2017:BAS, author = "Jiecao Chen and Qin Zhang", title = "Bias-aware sketches", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "961--972", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Linear sketching algorithms have been widely used for processing large-scale distributed and streaming datasets. Their popularity is largely due to the fact that linear sketches can be naturally composed in the distributed model and be efficiently updated in the streaming model. The errors of linear sketches are typically expressed in terms of the sum of coordinates of the input vector excluding those largest ones, or, the mass on the tail of the vector. Thus, the precondition for these algorithms to perform well is that the mass on the tail is small, which is, however, not always the case --- in many real-world datasets the coordinates of the input vector have a bias, which will generate a large mass on the tail. In this paper we propose linear sketches that are bias-aware. 
We rigorously prove that they achieve strictly better error guarantees than the corresponding existing sketches, and demonstrate their practicality and superiority via an extensive experimental evaluation on both real and synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2017:DDA, author = "Yang Cao and Wenfei Fan", title = "Data driven approximation with bounded resources", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "973--984", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes BEAS, a resource-bounded scheme for querying relations. It is parameterized with a resource ratio $ \alpha \in (0, 1] $, indicating that given a big dataset D, we can only afford to access an $ \alpha $-fraction of D with limited resources. For a query Q posed on D, BEAS computes exact answers Q(D) if doable and otherwise approximate answers, by accessing at most $ \alpha | D |$ amount of data in the entire process. Underlying BEAS are (1) an access schema, which helps us identify and fetch the part of data needed to answer Q, (2) an accuracy measure to assess approximate answers in terms of their relevance and coverage w.r.t. exact answers, (3) an Approximability Theorem for the feasibility of resource-bounded approximation, and (4) algorithms for query evaluation with bounded resources. A unique feature of BEAS is its ability to answer unpredictable queries, aggregate or not, using bounded resources and assuring a deterministic accuracy lower bound. Using real-life and synthetic data, we empirically verify the effectiveness and efficiency of BEAS.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khayyat:2017:ELF, author = "Zuhair Khayyat and William Lucia and Meghna Singh and Mourad Ouzzani and Paolo Papotti and Jorge-Arnulfo Quian{\'e}-Ruiz and Nan Tang and Panos Kalnis", title = "Errata for {``Lightning Fast and Space Efficient Inequality Joins'' (PVLDB 8(13): 2074--2085)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "9", pages = "985--985", month = may, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{Khayyat:2015:LFS}.", abstract = "This is in response to recent feedback from some readers, which requires some clarifications regarding our IEJoin algorithm published in [1]. The feedback revolves around four points: (1) a typo in our illustrating example of the join process; (2) a naming error for the index used by our algorithm to improve the bit array scan; (3) the sort order used in our algorithms; and (4) a missing explanation on how duplicates are handled by our self join algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qin:2017:SAG, author = "Chengjie Qin and Martin Torres and Florin Rusu", title = "Scalable asynchronous gradient descent optimization for out-of-core models", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "986--997", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing data analytics systems have approached predictive model training exclusively from a data-parallel perspective. Data examples are partitioned to multiple workers and training is executed concurrently over different partitions, under various synchronization policies that emphasize speedup or convergence. Since models with millions and even billions of features become increasingly common nowadays, model management becomes an equally important task for effective training. In this paper, we present a general framework for parallelizing stochastic optimization algorithms over massive models that cannot fit in memory. We extend the lock-free HOGWILD!-family of algorithms to disk-resident models by vertically partitioning the model offline and asynchronously updating the resulting partitions online. Unlike HOGWILD!, concurrent requests to the common model are minimized by a preemptive push-based sharing mechanism that reduces the number of disk accesses. Experimental results on real and synthetic datasets show that the proposed framework achieves improved convergence over HOGWILD! and is the only solution scalable to massive models.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:WEM, author = "Fan Zhang and Ying Zhang and Lu Qin and Wenjie Zhang and Xuemin Lin", title = "When engagement meets similarity: efficient $ (k, r)$-core computation on social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "998--1009", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we investigate the problem of $ (k, r)$-core which intends to find cohesive subgraphs on social networks considering both user engagement and similarity perspectives. In particular, we adopt the popular concept of $k$-core to guarantee the engagement of the users (vertices) in a group (subgraph) where each vertex in a $ (k, r)$-core connects to at least $k$ other vertices. Meanwhile, we consider the pairwise similarity among users based on their attributes. Efficient algorithms are proposed to enumerate all maximal $ (k, r)$-cores and find the maximum $ (k, r)$-core, where both problems are shown to be NP-hard. Effective pruning techniques substantially reduce the search space of two algorithms. A novel $ (k, k')$-core based $ (k, r)$-core size upper bound enhances performance of the maximum $ (k, r)$-core computation. We also devise effective search orders for two mining algorithms where search priorities for vertices are different. Comprehensive experiments on real-life data demonstrate that the maximal/maximum $ (k, r)$-cores enable us to find interesting cohesive subgraphs, and performance of two mining algorithms is effectively improved by proposed techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2017:EEP, author = "Yiding Liu and Tuan-Anh Nguyen Pham and Gao Cong and Quan Yuan", title = "An experimental evaluation of point-of-interest recommendation in location-based social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1010--1021", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Point-of-interest (POI) recommendation is an important service to Location-Based Social Networks (LBSNs) that can benefit both users and businesses. In recent years, a number of POI recommender systems have been proposed, but there is still a lack of systematical comparison thereof. In this paper, we provide an all-around evaluation of 12 state-of-the-art POI recommendation models. From the evaluation, we obtain several important findings, based on which we can better understand and utilize POI recommendation models in various scenarios. We anticipate this work to provide readers with an overall picture of the cutting-edge research on POI recommendation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Raasveldt:2017:DHM, author = "Mark Raasveldt and Hannes M{\"u}hleisen", title = "Don't hold my data hostage: a case for client protocol redesign", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1022--1033", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transferring a large amount of data from a database to a client program is a surprisingly expensive operation. The time this requires can easily dominate the query execution time for large result sets. This represents a significant hurdle for external data analysis, for example when using statistical software. In this paper, we explore and analyse the result set serialization design space. We present experimental results from a large chunk of the database market and show the inefficiencies of current approaches. We then propose a columnar serialization method that improves transmission performance by an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2017:AJJ, author = "Erkang Zhu and Yeye He and Surajit Chaudhuri", title = "{Auto-join}: joining tables by leveraging transformations", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1034--1045", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional equi-join relies solely on string equality comparisons to perform joins. 
However, in scenarios such as ad-hoc data analysis in spreadsheets, users increasingly need to join tables whose join-columns are from the same semantic domain but use different textual representations, for which transformations are needed before equi-join can be performed. We developed Auto-Join, a system that can automatically search over a rich space of operators to compose a transformation program, whose execution makes input tables equi-join-able. We developed an optimal sampling strategy that allows Auto-Join to scale to large datasets efficiently, while ensuring joins succeed with high probability. Our evaluation using real test cases collected from both public web tables and proprietary enterprise tables shows that the proposed system performs the desired transformation joins efficiently and with high quality.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:TSD, author = "Aoqian Zhang and Shaoxu Song and Jianmin Wang and Philip S. Yu", title = "Time series data cleaning: from anomaly detection to anomaly repairing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1046--1057", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Errors are prevalent in time series data, such as GPS trajectories or sensor readings. Existing methods focus more on anomaly detection but not on repairing the detected anomalies. By simply filtering out the dirty data via anomaly detection, applications could still be unreliable over the incomplete time series. 
Instead of simply discarding anomalies, we propose to (iteratively) repair them in time series data, by creatively bonding the beauty of temporal nature in anomaly detection with the widely considered minimum change principle in data repairing. Our major contributions include: (1) a novel framework of iterative minimum repairing (IMR) over time series data, (2) explicit analysis on convergence of the proposed iterative minimum repairing, and (3) efficient estimation of parameters in each iteration. Remarkably, with incremental computation, we reduce the complexity of parameter estimation from $ O (n) $ to $ O (1) $. Experiments on real datasets demonstrate the superiority of our proposal compared to the state-of-the-art approaches. In particular, we show that (the proposed) repairing indeed improves the time series classification application.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2017:PBM, author = "Lu Chen and Yunjun Gao and Baihua Zheng and Christian S. Jensen and Hanyu Yang and Keyu Yang", title = "Pivot-based metric indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1058--1069", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The general notion of a metric space encompasses a diverse range of data types and accompanying similarity measures. Hence, metric search plays an important role in a wide range of settings, including multimedia retrieval, data mining, and data integration. 
With the aim of accelerating metric search, a collection of pivot-based indexing techniques for metric data has been proposed, which reduces the number of potentially expensive similarity comparisons by exploiting the triangle inequality for pruning and validation. However, no comprehensive empirical study of those techniques exists. Existing studies each offers only a narrower coverage, and they use different pivot selection strategies that affect performance substantially and thus render cross-study comparisons difficult or impossible. We offer a survey of existing pivot-based indexing techniques, and report a comprehensive empirical comparison of their construction costs, update efficiency, storage sizes, and similarity search performance. As part of the study, we provide modifications for two existing indexing techniques to make them more competitive. The findings and insights obtained from the study reveal different strengths and weaknesses of different indexing techniques, and offer guidance on selecting an appropriate indexing technique for a given setting.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guerraoui:2017:HRW, author = "Rachid Guerraoui and Anne-Marie Kermarrec and Tao Lin and Rhicheek Patra", title = "Heterogeneous recommendations: what you might like to read after watching {Interstellar}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1070--1081", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recommenders, as widely implemented nowadays by major e-commerce players like Netflix or Amazon, use collaborative filtering to suggest the most relevant items to their users. 
Clearly, the effectiveness of recommenders depends on the data they can exploit, i.e., the feedback of users conveying their preferences, typically based on their past ratings. As of today, most recommenders are homogeneous in the sense that they utilize one specific application at a time. In short, Alice will only get recommended a movie if she has been rating movies. But what if she has been only rating books and would like to get recommendations for a movie? Clearly, the multiplicity of web applications is calling for heterogeneous recommenders that could utilize ratings in one application to provide recommendations in another one. This paper presents X-Map, a heterogeneous recommender. X-Map leverages meta-paths between heterogeneous items over several application domains, based on users who rated across these domains. These meta-paths are then used in X-Map to generate, for every user, a profile (AlterEgo) in a domain where the user might not have rated any item yet. Not surprisingly, leveraging meta-paths poses non-trivial issues of (a) meta-path-based inter-item similarity, in order to enable accurate predictions, (b) scalability, given the amount of computation required, as well as (c) privacy, given the need to aggregate information across multiple applications. We show in this paper how X-Map addresses the above-mentioned issues to achieve accuracy, scalability and differential privacy. In short, X-Map weights the meta-paths based on several factors to compute inter-item similarities, and ensures scalability through a layer-based pruning technique. X-Map guarantees differential privacy using an exponential scheme that leverages the meta-path-based similarities while determining the probability of item selection to construct the AlterEgos. We present an exhaustive experimental evaluation of X-Map using real traces from Amazon.
We show that, in terms of accuracy, X-Map outperforms alternative heterogeneous recommenders and, in terms of throughput, X-Map achieves a linear speedup with an increasing number of machines.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deng:2017:SEM, author = "Dong Deng and Albert Kim and Samuel Madden and Michael Stonebraker", title = "{SilkMoth}: an efficient method for finding related sets with maximum matching constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1082--1093", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Determining if two sets are related --- that is, if they have similar values or if one set contains the other --- is an important problem with many applications in data cleaning, data integration, and information retrieval. For example, set relatedness can be a useful tool to discover whether columns from two different databases are joinable; if enough of the values in the columns match, it may make sense to join them. A common metric is to measure the relatedness of two sets by treating the elements as vertices of a bipartite graph and calculating the score of the maximum matching pairing between elements. Compared to other metrics which require exact matchings between elements, this metric uses a similarity function to compare elements between the two sets, making it robust to small dissimilarities in elements and more useful for real-world, dirty data. Unfortunately, the metric suffers from expensive computational cost, taking $ O(n^3) $ time, where $ n $ is the number of elements in the sets, for each set-to-set comparison.
Thus for applications that try to search for all pairings of related sets in a brute-force manner, the runtime becomes unacceptably large. To address this challenge, we developed SilkMoth, a system capable of rapidly discovering related set pairs in collections of sets. Internally, SilkMoth creates a signature for each set, with the property that any other set which is related must match the signature. SilkMoth then uses these signatures to prune the search space, so only sets that match the signatures are left as candidates. Finally, SilkMoth applies the maximum matching metric on remaining candidates to verify which of these candidates are truly related sets. An important property of SilkMoth is that it is guaranteed to output exactly the same related set pairings as the brute-force method, unlike approximate techniques. Thus, a contribution of this paper is the characterization of the space of signatures which enable this property. We show that selecting the optimal signature in this space is NP-complete, and based on insights from the characterization of the space, we propose two novel filters which help to prune the candidates further before verification. In addition, we introduce a simple optimization to the calculation of the maximum matching metric itself based on the triangle inequality. Compared to related approaches, SilkMoth is much more general, handling a larger space of similarity functions and relatedness metrics, and is an order of magnitude more efficient on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chung:2017:DQM, author = "Yeounoh Chung and Sanjay Krishnan and Tim Kraska", title = "A data quality metric {(DQM)}: how to estimate the number of undetected errors in data sets", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1094--1105", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data cleaning, whether manual or algorithmic, is rarely perfect leaving a dataset with an unknown number of false positives and false negatives after cleaning. In many scenarios, quantifying the number of remaining errors is challenging because our data integrity rules themselves may be incomplete, or the available gold-standard datasets may be too small to extrapolate. As the use of inherently fallible crowds becomes more prevalent in data cleaning problems, it is important to have estimators to quantify the extent of such errors. We propose novel species estimators to estimate the number of distinct remaining errors in a dataset after it has been cleaned by a set of crowd workers --- essentially, quantifying the utility of hiring additional workers to clean the dataset. This problem requires new estimators that are robust to false positives and false negatives, and we empirically show on three real-world datasets that existing species estimators are unstable for this problem, while our proposed techniques quickly converge.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Olma:2017:SCT, author = "Matthaios Olma and Manos Karpathiotakis and Ioannis Alagiannis and Manos Athanassoulis and Anastasia Ailamaki", title = "{Slalom}: coasting through raw data via adaptive partitioning and indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1106--1117", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The constant flux of data and queries alike has been pushing the boundaries of data analysis systems. The increasing size of raw data files has made data loading an expensive operation that delays the data-to-insight time. Hence, recent in-situ query processing systems operate directly over raw data, alleviating the loading cost. At the same time, analytical workloads have increasing number of queries. Typically, each query focuses on a constantly shifting --- yet small --- range. Minimizing the workload latency, now, requires the benefits of indexing in in-situ query processing. In this paper, we present Slalom, an in-situ query engine that accommodates workload shifts by monitoring user access patterns. Slalom makes on-the-fly partitioning and indexing decisions, based on information collected by lightweight monitoring. Slalom has two key components: (i) an online partitioning and indexing scheme, and (ii) a partitioning and indexing tuner tailored for in-situ query engines. When compared to the state of the art, Slalom offers performance benefits by taking into account user query patterns to (a) logically partition raw data files and (b) build for each partition lightweight partition-specific indexes. 
Due to its lightweight and adaptive nature, Slalom achieves efficient accesses to raw data with minimal memory consumption. Our experimentation with both micro-benchmarks and real-life workloads shows that Slalom outperforms state-of-the-art in-situ engines (3--10$ \times $), and achieves comparable query response times with fully indexed DBMS, offering much lower ($ \approx 3 \times $) cumulative query execution times for query workloads with increasing size and unpredictable access patterns.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:MFJ, author = "Yinan Li and Nikos R. Katsipoulakis and Badrish Chandramouli and Jonathan Goldstein and Donald Kossmann", title = "{Mison}: a fast {JSON} parser for data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1118--1129", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The growing popularity of the JSON format has fueled increased interest in loading and processing JSON data within analytical data processing systems. However, in many applications, JSON parsing dominates performance and cost. In this paper, we present a new JSON parser called Mison that is particularly tailored to this class of applications, by pushing down both projection and filter operators of analytical queries into the parser. To achieve these features, we propose to deviate from the traditional approach of building parsers using finite state machines (FSMs). Instead, we follow a two-level approach that enables the parser to jump directly to the correct position of a queried field without having to perform expensive tokenizing steps to find the field. 
At the upper level, Mison speculatively predicts the logical locations of queried fields based on previously seen patterns in a dataset. At the lower level, Mison builds structural indices on JSON data to map logical locations to physical locations. Unlike all existing FSM-based parsers, building structural indices converts control flow into data flow, thereby largely eliminating inherently unpredictable branches in the program and exploiting the parallelism available in modern processors. We experimentally evaluate Mison using representative real-world JSON datasets and the TPC-H benchmark, and show that Mison produces significant performance benefits over the best existing JSON parsers; in some cases, the performance improvement is over one order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2017:OBV, author = "Silu Huang and Liqi Xu and Jialin Liu and Aaron J. Elmore and Aditya Parameswaran", title = "{OrpheusDB}: bolt-on versioning for relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1130--1141", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data science teams often collaboratively analyze datasets, generating dataset versions at each stage of iterative exploration and analysis. There is a pressing need for a system that can support dataset versioning, enabling such teams to efficiently store, track, and query across dataset versions. We introduce OrpheusDB, a dataset version control system that ``bolts on'' versioning capabilities to a traditional relational database system, thereby gaining the analytics capabilities of the database ``for free''.
We develop and evaluate multiple data models for representing versioned data, as well as a light-weight partitioning scheme, LyreSplit, to further optimize the models for reduced query latencies. With LyreSplit, OrpheusDB is on average $ 10^3 \times $ faster in finding effective (and better) partitionings than competing approaches, while also reducing the latency of version retrieval by up to $ 20 \times $ relative to schemes without partitioning. LyreSplit can be applied in an online fashion as new versions are added, alongside an intelligent migration scheme that reduces migration time by $ 10 \times $ on average.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Galakatos:2017:RRA, author = "Alex Galakatos and Andrew Crotty and Emanuel Zgraggen and Carsten Binnig and Tim Kraska", title = "Revisiting reuse for approximate query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1142--1153", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visual data exploration tools allow users to quickly gather insights from new datasets. As dataset sizes continue to increase, though, new techniques will be necessary to maintain the interactivity guarantees that these tools require. Approximate query processing (AQP) attempts to tackle this problem and allows systems to return query results at ``human speed.'' However, existing AQP techniques start to break down when confronted with ad hoc queries that target the tails of the distribution. We therefore present an AQP formulation that can provide low-error approximate results at interactive speeds, even for queries over rare subpopulations. 
In particular, our formulation treats query results as random variables in order to leverage the ample opportunities for result reuse inherent in interactive data exploration. As part of our approach, we apply a variety of optimization techniques that are based on probability theory, including new query rewrite rules and index structures. We implemented these techniques in a prototype system and show that they can achieve interactivity where alternative approaches cannot.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Orr:2017:PDS, author = "Laurel Orr and Magdalena Balazinska and Dan Suciu", title = "Probabilistic database summarization for interactive data exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "10", pages = "1154--1165", month = jun, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 17:12:46 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a probabilistic approach to generate a small, query-able summary of a dataset for interactive data exploration. Departing from traditional summarization techniques, we use the Principle of Maximum Entropy to generate a probabilistic representation of the data that can be used to give approximate query answers. We develop the theoretical framework and formulation of our probabilistic representation and show how to use it to answer queries. We then present solving techniques and give three critical optimizations to improve preprocessing time and query accuracy. Lastly, we experimentally evaluate our work using a 5 GB dataset of flights within the United States and a 210 GB dataset from an astronomy particle simulation. 
While our current work only supports linear queries, we show that our technique can successfully answer queries faster than sampling while introducing, on average, no more error than sampling and can better distinguish between rare and nonexistent values.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Oukid:2017:MMT, author = "Ismail Oukid and Daniel Booss and Adrien Lespinasse and Wolfgang Lehner and Thomas Willhalm and Gr{\'e}goire Gomes", title = "Memory management techniques for large-scale persistent-main-memory systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1166--1177", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Storage Class Memory (SCM) is a novel class of memory technologies that promise to revolutionize database architectures. SCM is byte-addressable and exhibits latencies similar to those of DRAM, while being non-volatile. Hence, SCM could replace both main memory and storage, enabling a novel single-level database architecture without the traditional I/O bottleneck. Fail-safe persistent SCM allocation can be considered conditio sine qua non for enabling this novel architecture paradigm for database management systems. In this paper we present PAllocator, a fail-safe persistent SCM allocator whose design emphasizes high concurrency and capacity scalability. Contrary to previous works, PAllocator thoroughly addresses the important challenge of persistent memory fragmentation by implementing an efficient defragmentation algorithm. 
We show that PAllocator outperforms state-of-the-art persistent allocators by up to one order of magnitude, both in operation throughput and recovery time, and enables up to $ 2.39 \times $ higher operation throughput on a persistent B-Tree.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shang:2017:TSJ, author = "Shuo Shang and Lisi Chen and Zhewei Wei and Christian S. Jensen and Kai Zheng and Panos Kalnis", title = "Trajectory similarity join in spatial networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1178--1189", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The matching of similar pairs of objects, called similarity join, is fundamental functionality in data management. We consider the case of trajectory similarity join (TS-Join), where the objects are trajectories of vehicles moving in road networks. Thus, given two sets of trajectories and a threshold $ \theta $, the TS-Join returns all pairs of trajectories from the two sets with similarity above $ \theta $. This join targets applications such as trajectory near-duplicate detection, data cleaning, ridesharing recommendation, and traffic congestion prediction. With these applications in mind, we provide a purposeful definition of similarity. To enable efficient TS-Join processing on large sets of trajectories, we develop search space pruning techniques and take into account the parallel processing capabilities of modern processors. Specifically, we present a two-phase divide-and-conquer algorithm. For each trajectory, the algorithm first finds similar trajectories. Then it merges the results to achieve a final result. 
The algorithm exploits an upper bound on the spatiotemporal similarity and a heuristic scheduling strategy for search space pruning. The algorithm's per-trajectory searches are independent of each other and can be performed in parallel, and the merging has constant cost. An empirical study with real data offers insight in the performance of the algorithm and demonstrates that it is capable of outperforming a well-designed baseline algorithm by an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rekatsinas:2017:HHD, author = "Theodoros Rekatsinas and Xu Chu and Ihab F. Ilyas and Christopher R{\'e}", title = "{HoloClean}: holistic data repairs with probabilistic inference", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1190--1201", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce HoloClean, a framework for holistic data repairing driven by probabilistic inference. HoloClean unifies qualitative data repairing, which relies on integrity constraints or external data sources, with quantitative data repairing methods, which leverage statistical properties of the input data. Given an inconsistent dataset as input, HoloClean automatically generates a probabilistic program that performs data repairing. Inspired by recent theoretical advances in probabilistic inference, we introduce a series of optimizations which ensure that inference over HoloClean's probabilistic model scales to instances with millions of tuples.
We show that HoloClean finds data repairs with an average precision of $ \approx $ 90\% and an average recall of above $ \approx $ 76\% across a diverse array of datasets exhibiting different types of errors. This yields an average F1 improvement of more than $ 2 \times $ against state-of-the-art methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Istvan:2017:CID, author = "Zsolt Istv{\'a}n and David Sidler and Gustavo Alonso", title = "{Caribou}: intelligent distributed storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1202--1213", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ever increasing amount of data being handled in data centers causes an intrinsic inefficiency: moving data around is expensive in terms of bandwidth, latency, and power consumption, especially given the low computational complexity of many database operations. In this paper we explore near-data processing in database engines, i.e., the option of offloading part of the computation directly to the storage nodes. We implement our ideas in Caribou, an intelligent distributed storage layer incorporating many of the lessons learned while building systems with specialized hardware. Caribou provides access to DRAM/NVRAM storage over the network through a simple key--value store interface, with each storage node providing high-bandwidth near-data processing at line rate and fault tolerance through replication. 
The result is a highly efficient, distributed, intelligent data storage that can be used to both boost performance and reduce power consumption and real estate usage in the data center thanks to the micro-server architecture adopted.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2017:TLA, author = "Lingjiao Chen and Arun Kumar and Jeffrey Naughton and Jignesh M. Patel", title = "Towards linear algebra over normalized data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1214--1225", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Providing machine learning (ML) over relational data is a mainstream requirement for data analytics systems. While almost all ML tools require the input data to be presented as a single table, many datasets are multi-table. This forces data scientists to join those tables first, which often leads to data redundancy and runtime waste. Recent works on ``factorized'' ML mitigate this issue for a few specific ML algorithms by pushing ML through joins. But their approaches require a manual rewrite of ML implementations. Such piecemeal methods create a massive development overhead when extending such ideas to other ML algorithms. In this paper, we show that it is possible to mitigate this overhead by leveraging a popular formal algebra to represent the computations of many ML algorithms: linear algebra. We introduce a new logical data type to represent normalized data and devise a framework of algebraic rewrite rules to convert a large set of linear algebra operations over denormalized data into operations over normalized data. 
We show how this enables us to automatically ``factorize'' several popular ML algorithms, thus unifying and generalizing several prior works. We prototype our framework in the popular ML environment R and an industrial R-over-RDBMS tool. Experiments with both synthetic and real normalized data show that our framework also yields significant speed-ups, up to $ 36 \times $ on real data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mehta:2017:CEB, author = "Parmita Mehta and Sven Dorkenwald and Dongfang Zhao and Tomer Kaftan and Alvin Cheung and Magdalena Balazinska and Ariel Rokem and Andrew Connolly and Jacob Vanderplas and Yusra AlSayyad", title = "Comparative evaluation of big-data systems on scientific image analytics workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1226--1237", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Scientific discoveries are increasingly driven by analyzing large volumes of image data. Many new libraries and specialized database management systems (DBMSs) have emerged to support such tasks. It is unclear how well these systems support real-world image analysis use cases, and how performant the image analytics tasks implemented on top of such systems are. In this paper, we present the first comprehensive evaluation of large-scale image analysis systems using two real-world scientific image data processing use cases. We evaluate five representative systems (SciDB, Myria, Spark, Dask, and TensorFlow) and find that each of them has shortcomings that complicate implementation or hurt performance. 
Such shortcomings lead to new research opportunities in making large-scale image analysis both efficient and easy to use.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aslay:2017:RMI, author = "Cigdem Aslay and Francesco Bonchi and Laks V. S. Lakshmanan and Wei Lu", title = "Revenue maximization in incentivized social advertising", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1238--1249", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Incentivized social advertising, an emerging marketing model, provides monetization opportunities not only to the owners of the social networking platforms but also to their influential users by offering a ``cut'' on the advertising revenue. We consider a social network (the host) that sells ad-engagements to advertisers by inserting their ads, in the form of promoted posts, into the feeds of carefully selected ``initial endorsers'' or seed users: these users receive monetary incentives in exchange for their endorsements. The endorsements help propagate the ads to the feeds of their followers. Whenever any user engages with an ad, the host is paid some fixed amount by the advertiser, and the ad further propagates to the feed of her followers, potentially recursively. In this context, the problem for the host is to allocate ads to influential users, taking into account the propensity of ads for viral propagation, and carefully apportioning the monetary budget of each of the advertisers between incentives to influential users and ad-engagement costs, with the rational goal of maximizing its own revenue.
We show that, taking all important factors into account, the problem of revenue maximization in incentivized social advertising corresponds to the problem of monotone submodular function maximization, subject to a partition matroid constraint on the ads-to-seeds allocation, and submodular knapsack constraints on the advertisers' budgets. We show that this problem is NP-hard and devise two greedy algorithms with provable approximation guarantees, which differ in their sensitivity to seed user incentive costs. Our approximation algorithms require repeatedly estimating the expected marginal gain in revenue as well as in advertiser payment. By exploiting a connection to the recent advances made in scalable estimation of expected influence spread, we devise efficient and scalable versions of our two greedy algorithms. An extensive experimental assessment confirms the high quality of our proposal.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rupprecht:2017:SNA, author = "Lukas Rupprecht and William Culhane and Peter Pietzuch", title = "{SquirrelJoin}: network-aware distributed join processing with lazy partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1250--1261", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To execute distributed joins in parallel on compute clusters, systems partition and exchange data records between workers. With large datasets, workers spend a considerable amount of time transferring data over the network. When compute clusters are shared among multiple applications, workers must compete for network bandwidth with other applications. 
These variances in the available network bandwidth lead to network skew, which causes straggling workers to prolong the join completion time. We describe SquirrelJoin, a distributed join processing technique that uses lazy partitioning to adapt to transient network skew in clusters. Workers maintain in-memory lazy partitions to withhold a subset of records, i.e. not sending them immediately to other workers for processing. Lazy partitions are then assigned dynamically to other workers based on network conditions: each worker takes periodic throughput measurements to estimate its completion time, and lazy partitions are allocated as to minimise the join completion time. We implement SquirrelJoin as part of the Apache Flink distributed dataflow framework and show that, under transient network contention in a shared compute cluster, SquirrelJoin speeds up join completion times by up to $ 2.9 \times $ with only a small, fixed overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rahman:2017:ISE, author = "Sajjadur Rahman and Maryam Aliakbarpour and Ha Kyung Kong and Eric Blais and Karrie Karahalios and Aditya Parameswaran and Ronitt Rubinfield", title = "{I}'ve seen ``enough'': incrementally improving visualizations to support rapid decision making", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1262--1273", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data visualization is an effective mechanism for identifying trends, insights, and anomalies in data. On large datasets, however, generating visualizations can take a long time, delaying the extraction of insights, hampering decision making, and reducing exploration time. 
One solution is to use online sampling-based schemes to generate visualizations faster while improving the displayed estimates incrementally, eventually converging to the exact visualization computed on the entire data. However, the intermediate visualizations are approximate, and often fluctuate drastically, leading to potentially incorrect decisions. We propose sampling-based incremental visualization algorithms that reveal the ``salient'' features of the visualization quickly --- with a $ 46 \times $ speedup relative to baselines --- while minimizing error, thus enabling rapid and error-free decision making. We demonstrate that these algorithms are optimal in terms of sample complexity, in that given the level of interactivity, they generate approximations that take as few samples as possible. We have developed the algorithms in the context of an incremental visualization tool, titled IncVisage, for trendline and heatmap visualizations. We evaluate the usability of IncVisage via user studies and demonstrate that users are able to make effective decisions with incrementally improving visualizations, especially compared to vanilla online-sampling based schemes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:MRT, author = "Lei Li and Wen Hua and Xingzhong Du and Xiaofang Zhou", title = "Minimal on-road time route scheduling on time-dependent graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1274--1285", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "On time-dependent graphs, fastest path query is an important problem and has been well studied. 
It focuses on minimizing the total travel time (waiting time + on-road time) but does not allow waiting on any intermediate vertex if the FIFO property is applied. However, in practice, waiting on a vertex can reduce the time spent on the road (for example, resuming traveling after a traffic jam). In this paper, we study how to find a path with the minimal on-road time on time-dependent graphs by allowing waiting on some predefined parking vertices. The existing works are based on the following fact: the arrival time of a vertex v is determined by the arrival time of its in-neighbor u, which does not hold in our scenario since we also consider the waiting time on u if u allows waiting. Thus, determining the waiting time on each parking vertex to achieve the minimal on-road time becomes a big challenge, which further breaks FIFO property. To cope with this challenging problem, we propose two efficient algorithms using minimum on-road travel cost function to answer the query. The evaluations on multiple real-world time-dependent graphs show that the proposed algorithms are more accurate and efficient than the extensions of existing algorithms. In addition, the results further indicate, if the parking facilities are enabled in the route scheduling algorithms, the on-road time will reduce significantly compared to the fastest path algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Katsipoulakis:2017:HVS, author = "Nikos R. Katsipoulakis and Alexandros Labrinidis and Panos K. 
Chrysanthis", title = "A holistic view of stream partitioning costs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1286--1297", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processing has become the dominant processing model for monitoring and real-time analytics. Modern Parallel Stream Processing Engines (pSPEs) have made it feasible to increase the performance in both monitoring and analytical queries by parallelizing a query's execution and distributing the load on multiple workers. A determining factor for the performance of a pSPE is the partitioning algorithm used to disseminate tuples to workers. Until now, partitioning methods in pSPEs have been similar to the ones used in parallel databases and only recently load-aware algorithms have been employed to improve the effectiveness of parallel execution. We identify and demonstrate the need to incorporate aggregation costs in the partitioning model when executing stateful operations in parallel, in order to minimize the overall latency and/or throughput. Towards this, we propose new stream partitioning algorithms, that consider both tuple imbalance and aggregation cost. We evaluate our proposed algorithms and show that they can achieve up to an order of magnitude better performance, compared to the current state of the art.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Akbas:2017:TBC, author = "Esra Akbas and Peixiang Zhao", title = "Truss-based community search: a truss-equivalence based indexing approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1298--1309", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the community search problem defined upon a large graph G: given a query vertex q in G, to find as output all the densely connected subgraphs of G, each of which contains the query v. As an online, query-dependent variant of the well-known community detection problem, community search enables personalized community discovery that has found widely varying applications in real-world, large-scale graphs. In this paper, we study the community search problem in the truss-based model aimed at discovering all dense and cohesive k-truss communities to which the query vertex q belongs. We introduce a novel equivalence relation, k-truss equivalence, to model the intrinsic density and cohesiveness of edges in k-truss communities. Consequently, all the edges of G can be partitioned to a series of k-truss equivalence classes that constitute a space-efficient, truss-preserving index structure, EquiTruss. Community search can be henceforth addressed directly upon EquiTruss without repeated, time-demanding accesses to the original graph, G, which proves to be theoretically optimal. In addition, EquiTruss can be efficiently updated in a dynamic fashion when G evolves with edge insertion and deletion. 
Experimental studies in real-world, large-scale graphs validate the efficiency and effectiveness of EquiTruss, which has achieved at least an order of magnitude speedup in community search over the state-of-the-art method, TCP-Index.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cambronero:2017:QOD, author = "Jos{\'e} Cambronero and John K. Feser and Micah J. Smith and Samuel Madden", title = "Query optimization for dynamic imputation", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1310--1321", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Missing values are common in data analysis and present a usability challenge. Users are forced to pick between removing tuples with missing values or creating a cleaned version of their data by applying a relatively expensive imputation strategy. Our system, ImputeDB, incorporates imputation into a cost-based query optimizer, performing necessary imputations on-the-fly for each query. This allows users to immediately explore their data, while the system picks the optimal placement of imputation operations. We evaluate this approach on three real-world survey-based datasets. Our experiments show that our query plans execute between 10 and 140 times faster than first imputing the base tables. Furthermore, we show that the query results from on-the-fly imputation differ from the traditional base-table imputation approach by 0--8\%. Finally, we show that while dropping tuples with missing values that fail query constraints discards 6--78\% of the data, on-the-fly imputation loses only 0--21\%.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marchant:2017:SER, author = "Neil G. Marchant and Benjamin I. P. Rubinstein", title = "In search of an entity resolution {OASIS}: optimal asymptotic sequential importance sampling", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1322--1333", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity resolution (ER) presents unique challenges for evaluation methodology. While crowdsourcing platforms acquire ground truth, sound approaches to sampling must drive labelling efforts. In ER, extreme class imbalance between matching and non-matching records can lead to enormous labelling requirements when seeking statistically consistent estimates for rigorous evaluation. This paper addresses this important challenge with the OASIS algorithm: a sampler and F-measure estimator for ER evaluation. OASIS draws samples from a (biased) instrumental distribution, chosen to ensure estimators with optimal asymptotic variance. As new labels are collected OASIS updates this instrumental distribution via a Bayesian latent variable model of the annotator oracle, to quickly focus on unlabelled items providing more information. We prove that resulting estimates of F-measure, precision, recall converge to the true population values. Thorough comparisons of sampling methods on a variety of ER datasets demonstrate significant labelling reductions of up to 83\% without loss to estimate accuracy.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tong:2017:FOT, author = "Yongxin Tong and Libin Wang and Zimu Zhou and Bolin Ding and Lei Chen and Jieping Ye and Ke Xu", title = "Flexible online task assignment in real-time spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1334--1345", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The popularity of Online To Offline (O2O) service platforms has spurred the need for online task assignment in real-time spatial data, where streams of spatially distributed tasks and workers are matched in real time such that the total number of assigned pairs is maximized. Existing online task assignment models assume that each worker is either assigned a task immediately or waits for a subsequent task at a fixed location once she/he appears on the platform. Yet in practice a worker may actively move around rather than passively wait in place if no task is assigned. In this paper, we define a new problem Flexible Two-sided Online task Assignment (FTOA). FTOA aims to guide idle workers based on the prediction of tasks and workers so as to increase the total number of assigned worker-task pairs. To address the FTOA problem, we face two challenges: (i) How to generate guidance for idle workers based on the prediction of the spatiotemporal distribution of tasks and workers? (ii) How to leverage the guidance of workers' movements to optimize the online task assignment? To this end, we propose a novel two-step framework, which integrates offline prediction and online task assignment. 
Specifically, we estimate the distributions of tasks and workers per time slot and per unit area, and design an online task assignment algorithm, Prediction-oriented Online task Assignment in Real-time spatial data (POLAR-OP). It yields a 0.47-competitive ratio, which is nearly twice better than that of the state-of-the-art. POLAR-OP also reduces the time complexity to process each newly-arrived task/worker to $ O(1) $. We validate the effectiveness and efficiency of our methods via extensive experiments on both synthetic datasets and real-world datasets from a large-scale taxi-calling platform.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bouros:2017:FSB, author = "Panagiotis Bouros and Nikos Mamoulis", title = "A forward scan based plane sweep algorithm for parallel interval joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1346--1357", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The interval join is a basic operation that finds application in temporal, spatial, and uncertain databases. Although a number of centralized and distributed algorithms have been proposed for the efficient evaluation of interval joins, classic plane sweep approaches have not been considered at their full potential. A recent piece of related work proposes an optimized approach based on plane sweep (PS) for modern hardware, showing that it greatly outperforms previous work. However, this approach depends on the development of a complex data structure and its parallelization has not been adequately studied. 
In this paper, we explore the applicability of a largely ignored forward scan (FS) based plane sweep algorithm, which is extremely simple to implement. We propose two optimizations of FS that greatly reduce its cost, making it competitive to the state-of-the-art single-threaded PS algorithm while achieving a lower memory footprint. In addition, we show the drawbacks of a previously proposed hash-based partitioning approach for parallel join processing and suggest a domain-based partitioning approach that does not produce duplicate results. Within our approach we propose a novel breakdown of the partition join jobs into a small number of independent mini-join jobs with varying cost and manage to avoid redundant comparisons. Finally, we show how these mini-joins can be scheduled in multiple CPU cores and propose an adaptive domain partitioning, aiming at load balancing. We include an experimental study that demonstrates the efficiency of our optimized FS and the scalability of our parallelization framework.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rong:2017:APA, author = "Kexin Rong and Peter Bailis", title = "{ASAP}: prioritizing attention via time series smoothing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1358--1369", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series visualization of streaming telemetry (i.e., charting of key metrics such as server load over time) is increasingly prevalent in modern data platforms and applications. However, many existing systems simply plot the raw data streams as they arrive, often obscuring large-scale trends due to small-scale noise. 
We propose an alternative: to better prioritize end users' attention, smooth time series visualizations as much as possible to remove noise, while retaining large-scale structure to highlight significant deviations. We develop a new analytics operator called ASAP that automatically smooths streaming time series by adaptively optimizing the trade-off between noise reduction (i.e., variance) and trend retention (i.e., kurtosis). We introduce metrics to quantitatively assess the quality of smoothed plots and provide an efficient search strategy for optimizing these metrics that combines techniques from stream processing, user interface design, and signal processing via autocorrelation-based pruning, pixel-aware preaggregation, and on-demand refresh. We demonstrate that ASAP can improve users' accuracy in identifying long-term deviations in time series by up to 38.4\% while reducing response times by up to 44.3\%. Moreover, ASAP delivers these results several orders of magnitude faster than alternative search strategies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:KVL, author = "Furong Li and Xin Luna Dong and Anno Langen and Yang Li", title = "Knowledge verification for long-tail verticals", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1370--1381", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Collecting structured knowledge for real-world entities has become a critical task for many applications. A big gap between the knowledge in existing knowledge repositories and the knowledge in the real world is the knowledge on tail verticals (i.e., less popular domains). 
Such knowledge, though not necessarily globally popular, can be personal hobbies to many people and thus collectively impactful. This paper studies the problem of knowledge verification for tail verticals; that is, deciding the correctness of a given triple. Through comprehensive experimental study we answer the following questions. (1) Can we find evidence for tail knowledge from an extensive set of sources, including knowledge bases, the web, and query logs? (2) Can we judge correctness of the triples based on the collected evidence? (3) How can we further improve knowledge verification on tail verticals? Our empirical study suggests a new knowledge-verification framework, which we call Facty, that applies various kinds of evidence collection techniques followed by knowledge fusion. Facty can verify 50\% of the (correct) tail knowledge with a precision of 84\%, and it significantly outperforms state-of-the-art methods. Detailed error analysis on the obtained results suggests future research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pande:2017:SRR, author = "Shiladitya Pande and Sayan Ranu and Arnab Bhattacharya", title = "{SkyGraph}: retrieving regions of interest using skyline subgraph queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1382--1393", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Several services today are annotated with points of interest (PoIs) such as ``coffee shop'', ``park'', etc. A region of interest (RoI) is a neighborhood that contains PoIs relevant to the user. In this paper, we study the scenario where a user wants to identify the best RoI in a city. 
The user expresses relevance through a set of keywords denoting PoIs. Ideally, the RoI should be small enough in size such that the user can conveniently explore the PoIs. On the other hand, it should be as relevant as possible. How does one balance the importance of size versus relevance? To a user exploring the RoI on foot, size is more critical. However, for a user equipped with a vehicle, relevance is a more important factor. In this paper, we solve this dilemma through skyline subgraph queries on keyword-embedded road networks. Skyline subgraphs subsume the choice of optimization function for an RoI since the optimal RoI for any rational user is necessarily a part of the skyline set. Our analysis reveals that the problem of computing the skyline set is NP-hard. We overcome the computational bottleneck by proposing a polynomial-time approximation algorithm called SkyGraph. To further expedite the running time, we develop an index structure, Partner Index, that drastically prunes the search space and provides up to 3 orders of magnitude speed-up on real road networks over the baseline approach. The datasets and executables are available at \path|http://www.cse.iitd.ac.in/~sayan/software.html|.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tan:2017:REA, author = "Wei Chit Tan and Meihui Zhang and Hazem Elmeleegy and Divesh Srivastava", title = "Reverse engineering aggregation queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1394--1405", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query reverse engineering seeks to re-generate the SQL query that produced a given query output table from a given database. 
In this paper, we solve this problem for OLAP queries with group-by and aggregation. We develop a novel three-phase algorithm named REGAL for this problem. First, based on a lattice graph structure, we identify a set of group-by candidates for the desired query. Second, we apply a set of aggregation constraints that are derived from the properties of aggregate operators at both the table-level and the group-level to discover candidate combinations of group-by columns and aggregations that are consistent with the given query output table. Finally, we find a multi-dimensional filter, i.e., a conjunction of selection predicates over the base table attributes, that is needed to generate the exact query output table. We conduct an extensive experimental study over the TPC-H dataset to demonstrate the effectiveness and efficiency of our proposal.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yut:2017:LRL, author = "Lele Yu and Ce Zhang and Yingxia Shao and Bin Cui", title = "{LDA*}: a robust and large-scale topic modeling system", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1406--1417", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present LDA*, a system that has been deployed in one of the largest Internet companies to fulfil their requirements of ``topic modeling as an internal service'' --- relying on thousands of machines, engineers in different sectors submit their data, some are as large as 1.8TB, to LDA* and get results back in hours. 
LDA* is motivated by the observation that none of the existing topic modeling systems is robust enough --- Each of these existing systems is designed for a specific point in the tradeoff space that can be sub-optimal, sometimes by up to $ 10 \times $, across workloads. Our first contribution is a systematic study of all recently proposed samplers: AliasLDA, F+LDA, LightLDA, and WarpLDA. We discovered a novel system tradeoff among these samplers. Each sampler has different sampling complexity and performs differently, sometimes by $ 5 \times $, on documents with different lengths. Based on this tradeoff, we further developed a hybrid sampler that uses different samplers for different types of documents. This hybrid approach works across a wide range of workloads and outperforms the fastest sampler by up to $ 2 \times $. We then focused on distributed environments in which thousands of workers, each with different performance (due to virtualization and resource sharing), coordinate to train a topic model. Our second contribution is an asymmetric parameter server architecture that pushes some computation to the parameter server side. This architecture is motivated by the skew of the word frequency distribution and a novel tradeoff we discovered between communication and computation. With this architecture, we outperform the traditional, symmetric architecture by up to $ 2 \times $. With these two contributions, together with a carefully engineered implementation, our system is able to outperform existing systems by up to $ 10 \times $ and has already been running to provide topic modeling services for more than six months.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kabiljo:2017:SHP, author = "Igor Kabiljo and Brian Karrer and Mayank Pundir and Sergey Pupyrev and Alon Shalita", title = "Social hash partitioner: a scalable distributed hypergraph partitioner", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1418--1429", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We design and implement a distributed algorithm for balanced $k$-way hypergraph partitioning that minimizes fanout, a fundamental hypergraph quantity also known as the communication volume and $ (k - 1)$-cut metric, by optimizing a novel objective called probabilistic fanout. This choice allows a simple local search heuristic to achieve comparable solution quality to the best existing hypergraph partitioners. Our algorithm is arbitrarily scalable due to a careful design that controls computational complexity, space complexity, and communication. In practice, we commonly process hypergraphs with billions of vertices and hyperedges in a few hours. We explain how the algorithm's scalability, both in terms of hypergraph size and bucket count, is limited only by the number of machines available. We perform an extensive comparison to existing distributed hypergraph partitioners and find that our approach is able to optimize hypergraphs roughly 100 times bigger on the same set of machines. We call the resulting tool Social Hash Partitioner, and accompanying this paper, we open-source the most scalable version based on recursive bisection.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ahmed:2017:SMG, author = "Nesreen K. Ahmed and Nick Duffield and Theodore L. Willke and Ryan A. Rossi", title = "On sampling from massive graph streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1430--1441", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose Graph Priority Sampling (gps), a new paradigm for order-based reservoir sampling from massive graph streams. gps provides a general way to weight edge sampling according to auxiliary and/or size variables so as to accomplish various estimation goals of graph properties. In the context of subgraph counting, we show how edge sampling weights can be chosen so as to minimize the estimation variance of counts of specified sets of subgraphs. In distinction with many prior graph sampling schemes, gps separates the functions of edge sampling and subgraph estimation. We propose two estimation frameworks: (1) Post-Stream estimation, to allow gps to construct a reference sample of edges to support retrospective graph queries, and (2) In-Stream estimation, to allow gps to obtain lower variance estimates by incrementally updating the subgraph count estimates during stream processing. Unbiasedness of subgraph estimators is established through a new Martingale formulation of graph stream order sampling, in which subgraph estimators, written as a product of constituent edge estimators, are unbiased, even when computed at different points in the stream. The separation of estimation and sampling enables significant resource savings relative to previous work. We illustrate our framework with applications to triangle and wedge counting. 
We perform a large-scale experimental study on real-world graphs from various domains and types. gps achieves high accuracy with $ < 1 \% $ error for triangle and wedge counting, while storing a small fraction of the graph with average update times of a few microseconds per edge. Notably, for billion-scale graphs, gps accurately estimates triangle and wedge counts with $ < 1 \% $ error, while storing a small fraction of $ < 0.01 \% $ of the total edges in the graph.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2017:PSS, author = "Tong Yang and Yang Zhou and Hao Jin and Shigang Chen and Xiaoming Li", title = "Pyramid sketch: a sketch framework for frequency estimation of data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1442--1453", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sketch is a probabilistic data structure, and is used to store and query the frequency of any item in a given multiset. Due to its high memory efficiency, it has been applied to various fields in computer science, such as stream database, network traffic measurement, etc. The key metrics of sketches for data streams are accuracy, speed, and memory usage. Various sketches have been proposed, but they cannot achieve both high accuracy and high speed using limited memory, especially for skewed datasets. To address this issue, we propose a sketch framework, the Pyramid sketch, which can significantly improve accuracy as well as update and query speed. To verify the effectiveness and efficiency of our framework, we applied our framework to four typical sketches. 
Extensive experimental results show that the accuracy is improved up to 3.50 times, while the speed is improved up to 2.10 times. We have released our source codes at GitHub.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ciaccia:2017:RSR, author = "Paolo Ciaccia and Davide Martinenghi", title = "Reconciling skyline and ranking queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1454--1465", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditionally, skyline and ranking queries have been treated separately as alternative ways of discovering interesting data in potentially large datasets. While ranking queries adopt a specific scoring function to rank tuples, skyline queries return the set of non-dominated tuples and are independent of attribute scales and scoring functions. Ranking queries are thus less general, but usually cheaper to compute and widely used in data management systems. We propose a framework to seamlessly integrate these two approaches by introducing the notion of restricted skyline queries (R-skylines). We propose R-skyline operators that generalize both skyline and ranking queries by applying the notion of dominance to a set of scoring functions of interest. Such sets can be characterized, e.g., by imposing constraints on the function's parameters, such as the weights in a linear scoring function. We discuss the formal properties of these new operators, show how to implement them efficiently, and evaluate them on both synthetic and real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Giannakopoulou:2017:COQ, author = "Stella Giannakopoulou and Manos Karpathiotakis and Benjamin Gaidioz and Anastasia Ailamaki", title = "{CleanM}: an optimizable query language for unified scale-out data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1466--1477", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data cleaning has become an indispensable part of data analysis due to the increasing amount of dirty data. Data scientists spend most of their time preparing dirty data before it can be used for data analysis. At the same time, the existing tools that attempt to automate the data cleaning procedure typically focus on a specific use case and operation. Still, even such specialized tools exhibit long running times or fail to process large datasets. Therefore, from a user's perspective, one is forced to use a different, potentially inefficient tool for each category of errors. This paper addresses the coverage and efficiency problems of data cleaning. It introduces CleanM (pronounced clean'em), a language which can express multiple types of cleaning operations. CleanM goes through a three-level translation process for optimization purposes; a different family of optimizations is applied in each abstraction level. Thus, CleanM can express complex data cleaning tasks, optimize them in a unified way, and deploy them in a scaleout fashion. We validate the applicability of CleanM by using it on top of CleanDB, a newly designed and implemented framework which can query heterogeneous data. 
When compared to existing data cleaning solutions, CleanDB (a) covers more data corruption cases, (b) scales better, and can handle cases for which its competitors are unable to terminate, and (c) uses a single interface for querying and for data cleaning.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2017:DTS, author = "Dong Xie and Feifei Li and Jeff M. Phillips", title = "Distributed trajectory similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1478--1489", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Mobile and sensing devices have already become ubiquitous. They have made tracking moving objects an easy task. As a result, mobile applications like Uber and many IoT projects have generated massive amounts of trajectory data that can no longer be processed by a single machine efficiently. Among the typical query operations over trajectories, similarity search is a common yet expensive operator in querying trajectory data. It is useful for applications in different domains such as traffic and transportation optimizations, weather forecast and modeling, and sports analytics. It is also a fundamental operator for many important mining operations such as clustering and classification of trajectories. In this paper, we propose a distributed query framework to process trajectory similarity search over a large set of trajectories. We have implemented the proposed framework in Spark, a popular distributed data processing engine, by carefully considering different design choices. Our query framework supports both the Hausdorff distance and the Fr{\'e}chet distance.
Extensive experiments have demonstrated the excellent scalability and query efficiency achieved by our design, compared to other methods and design alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandra:2017:ROJ, author = "Bikash Chandra and S. Sudarshan", title = "Runtime optimization of join location in parallel data management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1490--1501", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Applications running on parallel systems often need to join a streaming relation or a stored relation with data indexed in a parallel data storage system. Some applications also compute UDFs on the joined tuples. The join can be done at the data storage nodes, corresponding to reduce side joins, or by fetching data from the storage system to compute nodes, corresponding to map side join. Both may be suboptimal: reduce side joins may cause skew, while map side joins may lead to a lot of data being transferred and replicated. In this paper, we present techniques to make runtime decisions between the two options on a per key basis, in order to improve the throughput of the join, accounting for UDF computation if any. Our techniques are based on an extended ski-rental algorithm and provide worst-case performance guarantees with respect to the optimal point in the space considered by us. Our techniques use load balancing taking into account the CPU, network and I/O costs as well as the load on compute and storage nodes. We have implemented our techniques on Hadoop, Spark and the Muppet stream processing engine. 
Our experiments show that our optimization techniques provide a significant improvement in throughput over existing techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lehmberg:2017:SWT, author = "Oliver Lehmberg and Christian Bizer", title = "Stitching web tables for improving matching quality", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1502--1513", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "HTML tables on web pages (``web tables'') cover a wide variety of topics. Data from web tables can thus be useful for tasks such as knowledge base completion or ad hoc table extension. Before table data can be used for these tasks, the tables must be matched to the respective knowledge base or base table. The challenges of web table matching are the high heterogeneity and the small size of the tables. Though it is known that the majority of web tables are very small, the gold standards that are used to compare web table matching systems mostly consist of larger tables. In this experimental paper, we evaluate T2K Match, a web table to knowledge base matching system, and COMA, a standard schema matching tool, using a sample of web tables that is more realistic than the gold standards that were previously used. We find that both systems fail to produce correct results for many of the very small tables in the sample. As a remedy, we propose to stitch (combine) the tables from each web site into larger ones and match these enlarged tables to the knowledge base or base table afterwards. For this stitching process, we evaluate different schema matching methods in combination with holistic correspondence refinement. 
Limiting the stitching procedure to web tables from the same web site decreases the heterogeneity and allows us to stitch tables with very high precision. Our experiments show that applying table stitching before running the actual matching method improves the matching results by 0.38 in F1-measure for T2K Match and by 0.14 for COMA. Also, stitching the tables allows us to reduce the amount of tables in our corpus from 5 million original web tables to as few as 100,000 stitched tables.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shekelyan:2017:DHB, author = "Michael Shekelyan and Anton Dign{\"o}s and Johann Gamper", title = "{DigitHist}: a histogram-based data summary with tight error bounds", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1514--1525", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose DigitHist, a histogram summary for selectivity estimation on multi-dimensional data with tight error bounds. By combining multi-dimensional and one-dimensional histograms along regular grids of different resolutions, DigitHist provides an accurate and reliable histogram approach for multi-dimensional data. To achieve a compact summary, we use a sparse representation combined with a novel histogram compression technique that chooses a higher resolution in dense regions and a lower resolution elsewhere. For the construction of DigitHist, we propose a new error measure, termed $u$-error, which minimizes the width between the guaranteed upper and lower bounds of the selectivity estimate. The construction algorithm performs a single data scan and has linear time complexity.
An in-depth experimental evaluation shows that DigitHist delivers superior precision and error bounds than state-of-the-art competitors at a comparable query time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pilman:2017:FSK, author = "Markus Pilman and Kevin Bocksrocker and Lucas Braun and Renato Marroqu{\'\i}n and Donald Kossmann", title = "Fast scans on key--value stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1526--1537", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Key-Value Stores (KVS) are becoming increasingly popular because they scale up and down elastically, sustain high throughputs for get/put workloads and have low latencies. KVS owe these advantages to their simplicity. This simplicity, however, comes at a cost: It is expensive to process complex, analytical queries on top of a KVS because today's generation of KVS does not support an efficient way to scan the data. The problem is that there are conflicting goals when designing a KVS for analytical queries and for simple get/put workloads: Analytical queries require high locality and a compact representation of data whereas elastic get/put workloads require sparse indexes. This paper shows that it is possible to have it all, with reasonable compromises. We studied the KVS design space and built TellStore, a distributed KVS, that performs almost as well as state-of-the-art KVS for get/put workloads and orders of magnitude better for analytical and mixed workloads. 
This paper presents the results of comprehensive experiments with an extended version of the YCSB benchmark and a workload from the telecommunication industry.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2017:FMC, author = "Can Lu and Jeffrey Xu Yu and Hao Wei and Yikai Zhang", title = "Finding the maximum clique in massive graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1538--1549", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cliques refer to subgraphs in an undirected graph such that vertices in each subgraph are pairwise adjacent. The maximum clique problem, to find the clique with most vertices in a given graph, has been extensively studied. Besides its theoretical value as an NP-hard problem, the maximum clique problem is known to have direct applications in various fields, such as community search in social networks and social media, team formation in expert networks, gene expression and motif discovery in bioinformatics and anomaly detection in complex networks, revealing the structure and function of networks. However, algorithms designed for the maximum clique problem are expensive to deal with real-world networks. In this paper, we devise a randomized algorithm for the maximum clique problem. Different from previous algorithms that search from each vertex one after another, our approach RMC, for the randomized maximum clique problem, employs a binary search while maintaining a lower bound $ \omega_c $ and an upper bound $ \overline{\omega_c} $ of $ \omega (G) $. In each iteration, RMC attempts to find a $ \omega_t $-clique where $ \omega_t = \lfloor (\omega_c + \overline{\omega_c}) / 2 \rfloor $.
As finding $ \omega_t $ in each iteration is NP-complete, we extract a seed set S such that the problem of finding a $ \omega_t$-clique in G is equivalent to finding a $ \omega_t$-clique in S with probability guarantees $ (\geq 1 - n^{-c})$. We propose a novel iterative algorithm to determine the maximum clique by searching a $k$-clique in $S$ starting from $ k = \omega_c + 1$ until $S$ becomes $ \emptyset $, when more iterations benefit marginally. As confirmed by the experiments, our approach is much more efficient and robust than previous solutions and can always find the exact maximum clique.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:PPN, author = "Yuankai Zhang and Adam O'Neill and Micah Sherr and Wenchao Zhou", title = "Privacy-preserving network provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1550--1561", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Network accountability, forensic analysis, and failure diagnosis are becoming increasingly important for network management and security. Network provenance significantly aids network administrators in these tasks by explaining system behavior and revealing the dependencies between system states. Although resourceful, network provenance can sometimes be too rich, revealing potentially sensitive information that was involved in system execution. In this paper, we propose a cryptographic approach to preserve the confidentiality of provenance (sub)graphs while allowing users to query and access the parts of the graph for which they are authorized.
Our proposed solution is a novel application of searchable symmetric encryption (SSE) and more generally structured encryption (SE). Our SE-enabled provenance system allows a node to enforce access control policies over its provenance data even after the data has been shipped to remote nodes (e.g., for optimization purposes). We present a prototype of our design and demonstrate its practicality, scalability, and efficiency for both provenance maintenance and querying.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Garcia-Ulloa:2017:TDS, author = "Daniel A. Garcia-Ulloa and Li Xiong and Vaidy Sunderam", title = "Truth discovery for spatio-temporal events from crowdsourced data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1562--1573", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "One of the greatest challenges in spatial crowdsourcing is determining the veracity of reports from multiple users about a particular event or phenomenon. In this paper, we address the difficulties of truth discovery in spatio-temporal tasks and present a new method based on recursive Bayesian estimation (BE) from multiple reports of users. Our method incorporates a reliability model for users, which improves as more reports arrive while increasing the accuracy of the model in labeling the state of the event. The model is further improved by Kalman estimation (BE+KE) that models the spatio-temporal correlations of the events and predicts the next state of an event and is corrected when new reports arrive. The methods are tested in a simulated environment, as well as using real-world data. 
Experimental results show that our methods are adaptable to the available data, can incorporate previous beliefs, and outperform existing truth discovery methods of spatio-temporal events.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Trummer:2017:DVO, author = "Immanuel Trummer and Jiancheng Zhu and Mark Bryan", title = "Data vocalization: optimizing voice output of relational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1574--1585", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Research on data visualization aims at finding the best way to present data via visual interfaces. We introduce the complementary problem of ``data vocalization''. Our goal is to present relational data in the most efficient way via voice output. This problem setting is motivated by emerging tools and devices (e.g., Google Home, Amazon Echo, Apple's Siri, or voice-based SQL interfaces) that communicate data primarily via audio output to their users. We treat voice output generation as an optimization problem. The goal is to minimize speaking time while transmitting an approximation of a relational table to the user. We consider constraints on the precision of the transmitted data as well as on the cognitive load placed on the listener. We formalize voice output optimization and show that it is NP-hard. We present three approaches to solve that problem. First, we show how the problem can be translated into an integer linear program which enables us to apply corresponding solvers. Second, we present a two-phase approach that forms groups of similar rows in a pre-processing step, using a variant of the apriori algorithm. 
Then, we select an optimal combination of groups to generate a speech. Finally, we present a greedy algorithm that runs in polynomial time. Under simplifying assumptions, we prove that it generates near-optimal output by leveraging the sub-modularity property of our cost function. We compare our algorithms experimentally and analyze their complexity.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kang:2017:NON, author = "Daniel Kang and John Emmons and Firas Abuzaid and Peter Bailis and Matei Zaharia", title = "{NoScope}: optimizing neural network queries over video at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "11", pages = "1586--1597", month = aug, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Sep 5 16:07:00 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent advances in computer vision---in the form of deep neural networks---have made it possible to query increasing volumes of video data with high accuracy. However, neural network inference is computationally expensive at scale: applying a state-of-the-art object detector in real time (i.e., 30+ frames per second) to a single video requires a \$4000 GPU. In response, we present NoScope, a system for querying videos that can reduce the cost of neural network video analysis by up to three orders of magnitude via inference-optimized model search. Given a target video, object to detect, and reference neural network, NoScope automatically searches for and trains a sequence, or cascade, of models that preserves the accuracy of the reference network but is specialized to the target video and are therefore far less computationally expensive. 
NoScope cascades two types of models: specialized models that forego the full generality of the reference model but faithfully mimic its behavior for the target video and object; and difference detectors that highlight temporal differences across frames. We show that the optimal cascade architecture differs across videos and objects, so NoScope uses an efficient cost-based optimizer to search across models and cascades. With this approach, NoScope achieves two to three orders of magnitude speed-ups (265--15,500$ \times $ real-time) on binary classification tasks over fixed-angle webcam and surveillance video while maintaining accuracy within 1--5\% of state-of-the-art neural networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2017:PRA, author = "Juchang Lee and SeungHyun Moon and Kyu Hwan Kim and Deok Hoe Kim and Sang Kyun Cha and Wook-Shin Han", title = "Parallel replication across formats in {SAP HANA} for scaling out mixed {OLTP\slash OLAP} workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1598--1609", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137767", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern in-memory database systems are facing the need of efficiently supporting mixed workloads of OLTP and OLAP. A conventional approach to this requirement is to rely on ETL-style, application-driven data replication between two very different OLTP and OLAP systems, sacrificing real-time reporting on operational data. An alternative approach is to run OLTP and OLAP workloads in a single machine, which eventually limits the maximum scalability of OLAP query performance.
In order to tackle this challenging problem, we propose a novel database replication architecture called Asynchronous Parallel Table Replication (ATR). ATR supports OLTP workloads in one primary machine, while it supports heavy OLAP workloads in replicas. Here, row-store formats can be used for OLTP transactions at the primary, while column-store formats are used for OLAP analytical queries at the replicas. ATR is designed to support elastic scalability of OLAP query performance while it minimizes the overhead for transaction processing at the primary and minimizes CPU consumption for replayed transactions at the replicas. ATR employs a novel optimistic lock-free parallel log replay scheme which exploits characteristics of multi-version concurrency control (MVCC) in order to enable real-time reporting by minimizing the propagation delay between the primary and replicas. Through extensive experiments with a concrete implementation available in a commercial database system, we demonstrate that ATR achieves sub-second visibility delay even for update-intensive workloads, providing scalable OLAP performance without notable overhead to the primary.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shamsuddin:2017:DLD, author = "Rittika Shamsuddin and Amit Sawant and Balakrishnan Prabhakaran", title = "Developing a low dimensional patient class profile in accordance to their respiration-induced tumor motion", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1610--1621", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137768", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tumor location displacement caused by respiration-induced motion reduces the efficacy of radiation therapy. Three medically relevant patterns are often observed in the respiration-induced motion signal: baseline shift, ES-Range shift, and D-Range shift. In this paper, for patients with lower body cancer, we develop class profiles (a low dimensional pattern frequency structure) that characterize them in terms of these three medically relevant patterns. We propose an adaptive segmentation technique that turns each respiration-induced motion signal into a multi-set of segments based on persistent variations within the signal. These multi-sets of segments are then probed for base behaviors. These base behaviors are then used to develop the group/class profiles using a modified version of the clustering technique described in [1]. Finally, via quantitative analysis, we provide a medical characterization for the class profiles, which can be used to explore breathing intervention technique.
We show that, with (i) carefully designed feature sets, (ii) the proposed adaptive segmentation technique, (iii) the reasonable modifications to an existing clustering algorithm for multi-sets, and (iv) the proposed medical characterization methodology, it is possible to reduce the time series respiration-induced motion signals into a compact class profile. One of our co-authors is a medical physician and we used his expert opinion to verify the results.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ziauddin:2017:DBD, author = "Mohamed Ziauddin and Andrew Witkowski and You Jung Kim and Dmitry Potapov and Janaki Lahorani and Murali Krishna", title = "Dimensions based data clustering and zone maps", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1622--1633", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137769", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, the data warehouse industry has witnessed decreased use of indexing but increased use of compression and clustering of data facilitating efficient data access and data pruning in the query processing area. A classic example of data pruning is the partition pruning, which is used when table data is range or list partitioned. But lately, techniques have been developed to prune data at a lower granularity than a table partition or sub-partition. A good example is the use of data pruning structure called zone map. A zone map prunes zones of data from a table on which it is defined. Data pruning via zone map is very effective when the table data is clustered by the filtering columns. 
The database industry has offered support to cluster data in tables by its local columns, and to define zone maps on clustering columns of such tables. This has helped improve the performance of queries that contain filter predicates on local columns. However, queries in data warehouses are typically based on star/snowflake schema with filter predicates usually on columns of the dimension tables joined to a fact table. Given this, the performance of data warehouse queries can be significantly improved if the fact table data is clustered by columns of dimension tables together with zone maps that maintain min/max value ranges of these clustering columns over zones of fact table data. In recognition of this opportunity of significantly improving the performance of data warehouse queries, Oracle 12c release 1 has introduced the support for dimension based clustering of fact tables together with data pruning of the fact tables via dimension based zone maps.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Noghabi:2017:SSS, author = "Shadi A. Noghabi and Kartik Paramasivam and Yi Pan and Navina Ramesh and Jon Bringhurst and Indranil Gupta and Roy H. Campbell", title = "{Samza}: stateful scalable stream processing at {LinkedIn}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1634--1645", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137770", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed stream processing systems need to support stateful processing, recover quickly from failures to resume such processing, and reprocess an entire data stream quickly. 
We present Apache Samza, a distributed system for stateful and fault-tolerant stream processing. Samza utilizes a partitioned local state along with a low-overhead background changelog mechanism, allowing it to scale to massive state sizes (hundreds of TB) per application. Recovery from failures is sped up by re-scheduling based on Host Affinity. In addition to processing infinite streams of events, Samza supports processing a finite dataset as a stream, from either a streaming source (e.g., Kafka), a database snapshot (e.g., Databus), or a file system (e.g., HDFS), without having to change the application code (unlike the popular Lambda-based architectures which necessitate maintenance of separate code bases for batch and stream path processing). Samza is currently in use at LinkedIn by hundreds of production applications with more than 10,000 containers. Samza is an open-source Apache project adopted by many top-tier companies (e.g., LinkedIn, Uber, Netflix, TripAdvisor, etc.). Our experiments show that Samza: (a) handles state efficiently, improving latency and throughput by more than 100X compared to using a remote storage; (b) provides recovery time independent of state size; (c) scales performance linearly with number of containers; and (d) supports reprocessing of the data stream quickly and with minimal interference on real-time traffic.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Falk:2017:QAK, author = "Eric Falk and Vijay K.
Gurbani and Radu State", title = "Query-able {Kafka}: an agile data analytics pipeline for mobile wireless networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1646--1657", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137771", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to their promise of delivering real-time network insights, today's streaming analytics platforms are increasingly being used in the communications networks where the impact of the insights goes beyond sentiment and trend analysis to include real-time detection of security attacks and prediction of network state (i.e., is the network transitioning towards an outage). Current streaming analytics platforms operate under the assumption that arriving traffic is to the order of kilobytes produced at very high frequencies. However, communications networks, especially the telecommunication networks, challenge this assumption because some of the arriving traffic in these networks is to the order of gigabytes, but produced at medium to low velocities. Furthermore, these large datasets may need to be ingested in their entirety to render network insights in real-time. Our interest is to subject today's streaming analytics platforms --- constructed from state-of-the-art software components (Kafka, Spark, HDFS, ElasticSearch) --- to traffic densities observed in such communications networks. We find that filtering on such large datasets is best done in a common upstream point instead of being pushed to, and repeated, in downstream components. To demonstrate the advantages of such an approach, we modify Apache Kafka to perform limited native data transformation and filtering, relieving the downstream Spark application from doing this.
Our approach outperforms four prevalent analytics pipeline architectures with negligible overhead compared to standard Kafka. (Our modifications to Apache Kafka are publicly available at https://github.com/Esquive/queryable-kafka.git)", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nica:2017:SDS, author = "Anisoara Nica and Reza Sherkat and Mihnea Andrei and Xun Cheng and Martin Heidel and Christian Bensberg and Heiko Gerwens", title = "{Statisticum}: data statistics management in {SAP HANA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1658--1669", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137772", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce a new concept of leveraging traditional data statistics as dynamic data integrity constraints. These data statistics produce transient database constraints, which are valid as long as they can be proven to be consistent with the current data. We denote this type of data statistics by constraint data statistics, their properties needed for consistency checking by consistency metadata, and their implied integrity constraints by implied data statistics constraints (implied constraints for short). Implied constraints are valid integrity constraints which are powerful query optimization tools employed, just as traditional database constraints, in semantic query transformation (aka query reformulation), partition pruning, runtime optimization, and semi-join reduction, to name a few. To our knowledge, this is the first work introducing this novel and powerful concept of deriving implied integrity constraints from data statistics. 
We discuss theoretical aspects of the constraint data statistics concept and their integration into query processing. We present the current architecture of data statistics management in SAP HANA and detail how constraint data statistics are designed and integrated into this architecture. As an instantiation of this framework, we consider dynamic partition pruning for data aging scenarios. We discuss our current implementation for constraint data statistics objects in SAP HANA which can be used for dynamic partition pruning. We enumerate their properties and show how consistency checking for implied integrity constraints is supported in the data statistics architecture. Our experimental evaluations on the TPC-H benchmark and a real customer application confirm the effectiveness of the implied integrity constraints; (1) for 59\% of TPC-H queries, constraint data statistics utilization results in pruning cold partitions and reducing memory consumption, and (2) we observe up to 3 orders of magnitude speed-up in query processing time, for a real customer running an S/4HANA application.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gessert:2017:QQW, author = "Felix Gessert and Michael Schaarschmidt and Wolfram Wingerath and Erik Witt and Eiko Yoneki and Norbert Ritter", title = "{Quaestor}: query web caching for database-as-a-service providers", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1670--1681", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137773", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today, web performance is primarily governed by round-trip latencies between end devices and cloud services. 
To improve performance, services need to minimize the delay of accessing data. In this paper, we propose a novel approach to low latency that relies on existing content delivery and web caching infrastructure. The main idea is to enable application-independent caching of query results and records with tunable consistency guarantees, in particular bounded staleness. Quaestor (Query Store) employs two key concepts to incorporate both expiration-based and invalidation-based web caches: (1) an Expiring Bloom Filter data structure to indicate potentially stale data, and (2) statistically derived cache expiration times to maximize cache hit rates. Through a distributed query invalidation pipeline, changes to cached query results are detected in real-time. The proposed caching algorithms offer a new means for data-centric cloud services to trade latency against staleness bounds, e.g. in a database-as-a-service. Quaestor is the core technology of the backend-as-a-service platform Baqend, a cloud service for low-latency websites. We provide empirical evidence for Quaestor's scalability and performance through both simulation and experiments. The results indicate that for read-heavy workloads, up to tenfold speed-ups can be achieved through Quaestor's caching.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gasiunas:2017:FBA, author = "Vaidas Gasiunas and David Dominguez-Sal and Ralph Acker and Aharon Avitzur and Ilan Bronshtein and Rushan Chen and Eli Ginot and Norbert Martinez-Bazan and Michael M{\"u}ller and Alexander Nozdrin and Weijie Ou and Nir Pachter and Dima Sivov and Eliezer Levy", title = "Fiber-based architecture for {NFV} cloud databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1682--1693", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137774", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The telco industry is gradually shifting from using monolithic software packages deployed on custom hardware to using modular virtualized software functions deployed on cloudified data centers using commodity hardware. This transformation is referred to as Network Function Virtualization (NFV). The scalability of the databases (DBs) underlying the virtual network functions is the cornerstone for reaping the benefits from the NFV transformation. This paper presents an industrial experience of applying shared-nothing techniques in order to achieve the scalability of a DB in an NFV setup. The special combination of requirements in NFV DBs are not easily met with conventional execution models. Therefore, we designed a special shared-nothing architecture that is based on cooperative multi-tasking using user-level threads (fibers). We further show that the fiber-based approach outperforms the approach built using conventional multi-threading and meets the variable deployment needs of the NFV transformation. 
Furthermore, fibers yield a simpler-to-maintain software and enable controlling a trade-off between long-duration computations and real-time requests.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bose:2017:PDF, author = "Joos-Hendrik B{\"o}se and Valentin Flunkert and Jan Gasthaus and Tim Januschowski and Dustin Lange and David Salinas and Sebastian Schelter and Matthias Seeger and Yuyang Wang", title = "Probabilistic demand forecasting at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1694--1705", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137775", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a platform built on large-scale, data-centric machine learning (ML) approaches, whose particular focus is demand forecasting in retail. At its core, this platform enables the training and application of probabilistic demand forecasting models, and provides convenient abstractions and support functionality for forecasting problems. The platform comprises of a complex end-to-end machine learning system built on Apache Spark, which includes data preprocessing, feature engineering, distributed learning, as well as evaluation, experimentation and ensembling. Furthermore, it meets the demands of a production system and scales to large catalogues containing millions of items. We describe the challenges of building such a platform and discuss our design decisions. We detail aspects on several levels of the system, such as a set of general distributed learning schemes, our machinery for ensembling predictions, and a high-level dataflow abstraction for modeling complex ML pipelines. 
To the best of our knowledge, we are not aware of prior work on real-world demand forecasting systems which rivals our approach in terms of scalability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lee:2017:EBG, author = "Jinho Lee and Heesu Kim and Sungjoo Yoo and Kiyoung Choi and H. Peter Hofstee and Gi-Joon Nam and Mark R. Nutter and Damir Jamsek", title = "{ExtraV}: boosting graph processing near storage with a coherent accelerator", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1706--1717", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137776", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose ExtraV, a framework for near-storage graph processing. It is based on the novel concept of graph virtualization, which efficiently utilizes a cache-coherent hardware accelerator at the storage side to achieve performance and flexibility at the same time. ExtraV consists of four main components: (1) host processor, (2) main memory, (3) AFU (Accelerator Function Unit) and (4) storage. The AFU, a hardware accelerator, sits between the host processor and storage. Using a coherent interface that allows main memory accesses, it performs graph traversal functions that are common to various algorithms while the program running on the host processor (called the host program) manages the overall execution along with more application-specific tasks. Graph virtualization is a high-level programming model of graph processing that allows designers to focus on algorithm-specific functions. 
Realized by the accelerator, graph virtualization gives the host programs an illusion that the graph data reside on the main memory in a layout that fits with the memory access behavior of host programs even though the graph data are actually stored in a multi-level, compressed form in storage. We prototyped ExtraV on a Power8 machine with a CAPI-enabled FPGA. Our experiments on a real system prototype offer significant speedup compared to state-of-the-art software only implementations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Carbone:2017:SMA, author = "Paris Carbone and Stephan Ewen and Gyula F{\'o}ra and Seif Haridi and Stefan Richter and Kostas Tzoumas", title = "State management in {Apache Flink\reg}: consistent stateful distributed stream processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1718--1729", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137777", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processors are emerging in industry as an apparatus that drives analytical but also mission critical services handling the core of persistent application logic. Thus, apart from scalability and low-latency, a rising system need is first-class support for application state together with strong consistency guarantees, and adaptivity to cluster reconfigurations, software patches and partial failures. Although prior systems research has addressed some of these specific problems, the practical challenge lies on how such guarantees can be materialized in a transparent, non-intrusive manner that relieves the user from unnecessary constraints. 
Such needs served as the main design principles of state management in Apache Flink, an open source, scalable stream processor. We present Flink's core pipelined, in-flight mechanism which guarantees the creation of lightweight, consistent, distributed snapshots of application state, progressively, without impacting continuous execution. Consistent snapshots cover all needs for system reconfiguration, fault tolerance and version management through coarse grained rollback recovery. Application state is declared explicitly to the system, allowing efficient partitioning and transparent commits to persistent storage. We further present Flink's backend implementations and mechanisms for high availability, external state queries and output commit. Finally, we demonstrate how these mechanisms behave in practice with metrics and large-deployment insights exhibiting the low performance trade-offs of our approach and the general benefits of exploiting asynchrony in continuous, yet sustainable system deployments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2017:PHA, author = "Jianjun Zheng and Qian Lin and Jiatao Xu and Cheng Wei and Chuwei Zeng and Pingan Yang and Yunfan Zhang", title = "{PaxosStore}: high-availability storage made practical in {WeChat}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1730--1741", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137778", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present PaxosStore, a high-availability storage system developed to support the comprehensive business of WeChat. 
It employs a combinational design in the storage layer to engage multiple storage engines constructed for different storage models. PaxosStore is characteristic of extracting the Paxos-based distributed consensus protocol as a middleware that is universally accessible to the underlying multi-model storage engines. This facilitates tuning, maintaining, scaling and extending the storage engines. According to our experience in engineering practice, to achieve a practical consistent read/write protocol is far more complex than its theory. To tackle such engineering complexity, we propose a layered design of the Paxos-based storage protocol stack, where PaxosLog, the key data structure used in the protocol, is devised to bridge the programming-oriented consistent read/write to the storage-oriented Paxos procedure. Additionally, we present optimizations based on Paxos that made fault-tolerance more efficient. Discussion throughout the paper primarily focuses on pragmatic solutions that could be insightful for building practical distributed storage systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antonopoulos:2017:ROI, author = "Panagiotis Antonopoulos and Hanuma Kodavalla and Alex Tran and Nitish Upreti and Chaitali Shah and Mirek Sztajno", title = "Resumable online index rebuild in {SQL Server}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1742--1753", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137779", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Azure SQL Database and the upcoming release of SQL Server enhance Online Index Rebuild to provide fault-tolerance and allow index rebuild operations to resume after a system failure or a user-initiated pause. SQL Server is the first commercial DBMS to support pause and resume functionality for index rebuilds. This is achieved by splitting the operation into incremental units of work and persisting the required state so that it can be resumed later with minimal loss of progress. At the same time, the proposed technology minimizes the log space required for the operation to succeed, making it possible to rebuild large indexes using only a small, constant amount of log space. These capabilities are critical to guarantee the reliability of these operations in an environment where (a) the database sizes are increasing at a much faster pace compared to the available hardware, (b) system failures are frequent in Cloud architectures using commodity hardware, (c) software upgrades and other maintenance tasks are automatically handled by the Cloud platforms, introducing further unexpected failures for the users and (d) most modern applications need to be available 24/7 and have very tight maintenance windows.
This paper describes the design of ``Resumable Online Index Rebuild'' and discusses how this technology can be extended to cover more schema management operations in the future.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Andrei:2017:SHA, author = "Mihnea Andrei and Christian Lemke and G{\"u}nter Radestock and Robert Schulze and Carsten Thiel and Rolando Blanco and Akanksha Meghlan and Muhammad Sharique and Sebastian Seifert and Surendra Vishnoi and Daniel Booss and Thomas Peh and Ivan Schreter and Werner Thesing and Mehul Wagle and Thomas Willhalm", title = "{SAP HANA} adoption of non-volatile memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1754--1765", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137780", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Non-Volatile RAM (NVRAM) is a novel class of hardware technology which is an interesting blend of two storage paradigms: byte-addressable DRAM and block-addressable storage (e.g. HDD/SSD). Most of the existing enterprise relational data management systems such as SAP HANA have their internal architecture based on the inherent assumption that memory is volatile and base their persistence on explicit handling of block-oriented storage devices. In this paper, we present the early adoption of Non-Volatile Memory within the SAP HANA Database, from the architectural and technical angles. We discuss our architectural choices, dive deeper into a few challenges of the NVRAM integration and their solutions, and share our experimental results. 
As we present our solutions for the NVRAM integration, we also give, as a basis, a detailed description of the relevant HANA internals.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2017:CIS, author = "Mingming Zhang and Tianyu Wo and Tao Xie and Xuelian Lin and Yaxiao Liu", title = "{CarStream}: an industrial system of big data processing for {Internet-of-Vehicles}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1766--1777", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137781", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As the Internet-of-Vehicles (IoV) technology becomes an increasingly important trend for future transportation, designing large-scale IoV systems has become a critical task that aims to process big data uploaded by fleet vehicles and to provide data-driven services. The IoV data, especially high-frequency vehicle statuses (e.g., location, engine parameters), are characterized as large volume with a low density of value and low data quality. Such characteristics pose challenges for developing real-time applications based on such data. In this paper, we address the challenges in designing a scalable IoV system by describing CarStream, an industrial system of big data processing for chauffeured car services. Connected with over 30,000 vehicles, CarStream collects and processes multiple types of driving data including vehicle status, driver activity, and passenger-trip information. Multiple services are provided based on the collected data. CarStream has been deployed and maintained for three years in industrial usage, collecting over 40 terabytes of driving data. 
This paper shares our experiences on designing CarStream based on large-scale driving-data streams, and the lessons learned from the process of addressing the challenges in designing and maintaining CarStream.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bonetta:2017:FJF, author = "Daniele Bonetta and Matthias Brantner", title = "{FAD.js}: fast {JSON} data access using {JIT}-based speculative optimizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1778--1789", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137782", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "JSON is one of the most popular data encoding formats, with wide adoption in Databases and BigData frameworks as well as native support in popular programming languages such as JavaScript/Node.js, Python, and R. Nevertheless, JSON data processing can easily become a performance bottleneck in data-intensive applications because of parse and serialization overhead. In this paper, we introduce Fad.js, a runtime system for efficient processing of JSON objects in data-intensive applications. Fad.js is based on (1) speculative just-in-time (JIT) compilation and (2) selective access to data. Experiments show that applications using Fad.js achieve speedups up to 2.7x for encoding and 9.9x for decoding JSON data when compared to state-of-the art JSON processing libraries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aggour:2017:CCL, author = "Kareem S. Aggour and Jenny Weisenberg Williams and Justin McHugh and Vijay S.
Kumar", title = "{Colt}: concept lineage tool for data flow metadata capture and analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1790--1801", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137783", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most organizations are becoming increasingly data-driven, often processing data from many different sources to enable critical business operations. Beyond the well-addressed challenge of storing and processing large volumes of data, financial institutions in particular are increasingly subject to federal regulations requiring high levels of accountability for the accuracy and lineage of this data. For companies like GE Capital, which maintain data across a globally interconnected network of thousands of systems, it is becoming increasingly challenging to capture an accurate understanding of the data flowing between those systems. To address this problem, we designed and developed a concept lineage tool allowing organizational data flows to be modeled, visualized and interactively explored. This tool has novel features that allow a data flow network to be contextualized in terms of business-specific metadata such as the concept, business, and product for which it applies. Key analysis features have been implemented, including the ability to trace the origination of particular datasets, and to discover all systems where data is found that meets some user-defined criteria. This tool has been readily adopted by users at GE Capital and in a short time has already become a business-critical application, with over 2,200 data systems and over 1,000 data flows captured.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yeh:2017:MPI, author = "Chin-Chia Michael Yeh and Nickolas Kavantzas and Eamonn Keogh", title = "Matrix profile {IV}: using weakly labeled time series to predict outcomes", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1802--1812", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137784", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In academic settings over the last decade, there has been significant progress in time series classification. However, much of this work makes assumptions that are simply unrealistic for deployed industrial applications. Examples of these unrealistic assumptions include the following: assuming that data subsequences have a single fixed-length, are precisely extracted from the data, and are correctly labeled according to their membership in a set of equal-size classes. In real-world industrial settings, these patterns can be of different lengths, the class annotations may only belong to a general region of the data, may contain errors, and finally, the class distribution is typically highly skewed. Can we learn from such weakly labeled data? In this work, we introduce SDTS, a scalable algorithm that can learn in such challenging settings. We demonstrate the utility of our ideas by learning from diverse datasets with millions of datapoints. As we shall demonstrate, our domain-agnostic parameter-free algorithm can be competitive with domain-specific algorithms used in neuroscience and entomology, even when those algorithms have been tuned by domain experts to incorporate domain knowledge.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chakkappen:2017:ASO, author = "Sunil Chakkappen and Suratna Budalakoti and Ramarajan Krishnamachari and Satyanarayana R. Valluri and Alan Wood and Mohamed Zait", title = "Adaptive statistics in {Oracle 12c}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1813--1824", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137785", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database Management Systems (DBMS) continue to be the foundation of mission critical applications, both OLTP and Analytics. They provide a safe, reliable and efficient platform to store and retrieve data. SQL is the lingua franca of the database world. A database developer writes a SQL statement to specify data sources and express the desired result and the DBMS will figure out the most efficient way to implement it. The query optimizer is the component in a DBMS responsible for finding the best execution plan for a given SQL statement based on statistics, access structures, location, and format. At the center of a query optimizer is a cost model that consumes the above information and helps the optimizer make decisions related to query transformations, join order, join methods, access paths, and data movement. The final execution plan produced by the query optimizer depends on the quality of information used by the cost model, as well as the sophistication of the cost model. In addition to statistics about the data, the cost model also relies on statistics generated internally for intermediate results, e.g. size of the output of a join operation. 
This paper presents the problems caused by incorrect statistics of intermediate results, survey the existing solutions and present our solution introduced in Oracle 12c. The solution includes validating the generated statistics using table data and via the automatic creation of auxiliary statistics structures. We limit the overhead of the additional work by confining their use to cases where it matters the most, caching the computed statistics, and using table samples. The statistics management is automated. We demonstrate the benefits of our approach based on experiments using two SQL workloads, a benchmark that uses data from the Internal Movie Data Base (IMDB) and a real customer workload.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Floratou:2017:DSR, author = "Avrilia Floratou and Ashvin Agrawal and Bill Graham and Sriram Rao and Karthik Ramasamy", title = "{Dhalion}: self-regulating stream processing in {Heron}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1825--1836", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137786", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, there has been an explosion of large-scale real-time analytics needs and a plethora of streaming systems have been developed to support such applications. These systems are able to continue stream processing even when faced with hardware and software failures. 
However, these systems do not address some crucial challenges facing their operators: the manual, time-consuming and error-prone tasks of tuning various configuration knobs to achieve service level objectives (SLO) as well as the maintenance of SLOs in the face of sudden, unpredictable load variation and hardware or software performance degradation. In this paper, we introduce the notion of self-regulating streaming systems and the key properties that they must satisfy. We then present the design and evaluation of Dhalion, a system that provides self-regulation capabilities to underlying streaming systems. We describe our implementation of the Dhalion framework on top of Twitter Heron, as well as a number of policies that automatically reconfigure Heron topologies to meet throughput SLOs, scaling resource consumption up and down as needed. We experimentally evaluate our Dhalion policies in a cloud environment and demonstrate their effectiveness. We are in the process of open-sourcing our Dhalion policies as part of the Heron project.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2017:INO, author = "Erkang Zhu and Ken Q. Pu and Fatemeh Nargesian and Ren{\'e}e J. Miller", title = "Interactive navigation of open data linkages", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1837--1840", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137788", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We developed Toronto Open Data Search to support the ad hoc, interactive discovery of connections or linkages between datasets. It can be used to efficiently navigate through the open data cloud. 
Our system consists of three parts: a user-interface provided by a Web application; a scalable backend infrastructure that supports navigational queries; and a dynamic repository of open data tables. Our system uses LSH Ensemble, an efficient index structure, to compute linkages (attributes in two datasets with high containment score) in real time at Internet scale. Our application allows users to navigate along these linkages by joining datasets. LSH Ensemble is scalable, providing millisecond response times for linkage discovery queries even over millions of datasets. Our system offers users a highly interactive experience making unrelated (and unlinked) dynamic collections of datasets appear as a richly connected cloud of data that can be navigated and combined easily in real time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pimentel:2017:NTC, author = "Jo{\~a}o Felipe Pimentel and Leonardo Murta and Vanessa Braganholo and Juliana Freire", title = "{noWorkflow}: a tool for collecting, analyzing, and managing provenance from {Python} scripts", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1841--1844", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137789", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present noWorkflow, an open-source tool that systematically and transparently collects provenance from Python scripts, including data about the script execution and how the script evolves over time. During the demo, we will show how noWorkflow collects and manages provenance, as well as how it supports the analysis of computational experiments. 
We will also encourage attendees to use noWorkflow for their own scripts.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:ACB, author = "Chao Wang and Yihao Feng and Qi Guo and Zhaoxian Li and Kexin Liu and Zijian Tang and Anthony K. H. Tung and Lifu Wu and Yuxin Zheng", title = "{ARShop}: a cloud-based augmented reality system for shopping", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1845--1848", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137790", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "ARShop is a one-stop solution for shopping in the cyber-physical world with the help of crowd knowledge and augmented reality. Its ultimate goal is to improve customers' shopping experience. When a customer enters a physical shop and snaps a shot, the enriched cyber information of the surroundings will pop up and be augmented on the screen. ARShop can also be the customer's personal shopping assistant who can show routes to the shops that the customer is interested in. In addition, ARShop provides merchants with a web-based interface to manage their shops and promote their business to customers, and provides customers with an Android App to query using images.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Aberger:2017:MGB, author = "Christopher R. 
Aberger and Andrew Lamb and Kunle Olukotun and Christopher R{\'e}", title = "Mind the gap: bridging multi-domain query workloads with {EmptyHeaded}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1849--1852", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137791", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Executing domain specific workloads from a relational data warehouse is an increasingly popular task. Unfortunately, classic relational database management systems (RDBMS) are suboptimal in many domains (e.g., graph and linear algebra queries), and it is challenging to transfer data from an RDBMS to a domain specific toolkit in an efficient manner. This demonstration showcases the EmptyHeaded engine: an interactive query processing engine that leverages a novel query architecture to support efficient execution in multiple domains. To enable a unified design, the EmptyHeaded architecture is built around recent theoretical advancements in join processing and automated in-query data transformations. This demonstration highlights the strengths and weaknesses of this novel type of query processing architecture while showcasing its flexibility in multiple domains. In particular, attendees will use EmptyHeaded's Jupyter notebook front-end to interactively learn the theoretical advantages of this new (and largely unknown) approach and directly observe its performance impact in multiple domains.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Maccioni:2017:CFL, author = "Antonio Maccioni and Riccardo Torlone", title = "Crossing the finish line faster when paddling the data lake with {KAYAK}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1853--1856", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137792", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Paddling in a data lake is strenuous for a data scientist. Being a loosely-structured collection of raw data with little or no meta-information available, the difficulties of extracting insights from a data lake start from the initial phases of data analysis. Indeed, data preparation, which involves many complex operations (such as source and feature selection, exploratory analysis, data profiling, and data curation), is a long and involved activity for navigating the lake before getting precious insights at the finish line. In this framework, we demonstrate KAYAK, a framework that supports data preparation in a data lake with ad-hoc primitives and allows data scientists to cross the finish line sooner. KAYAK takes into account the tolerance of the user in waiting for the primitives' results and it uses incremental execution strategies to produce informative previews of these results. The framework is based on a wise management of metadata and on features that limit human intervention, thus scaling smoothly when the data lake evolves.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Niu:2017:DTT, author = "Xing Niu and Bahareh Sadat Arab and Seokki Lee and Su Feng and Xun Zou and Dieter Gawlick and Vasudha Krishnaswamy and Zhen Hua Liu and Boris Glavic", title = "Debugging transactions and tracking their provenance with reenactment", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1857--1860", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137793", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Debugging transactions and understanding their execution are of immense importance for developing OLAP applications, to trace causes of errors in production systems, and to audit the operations of a database. However, debugging transactions is hard for several reasons: (1) after the execution of a transaction, its input is no longer available for debugging, (2) internal states of a transaction are typically not accessible, and (3) the execution of a transaction may be affected by concurrently running transactions. We present a debugger for transactions that enables non-invasive, postmortem debugging of transactions with provenance tracking and supports what-if scenarios (changes to transaction code or data). Using reenactment, a declarative replay technique we have developed, a transaction is replayed over the state of the DB seen by its original execution including all its interactions with concurrently executed transactions from the history. Importantly, our approach uses the temporal database and audit logging capabilities available in many DBMS and does not require any modifications to the underlying database system nor transactional workload.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2017:PES, author = "Kai Huang and Sourav S. Bhowmick and Shuigeng Zhou and Byron Choi", title = "{\tt picasso}: exploratory search of connected subgraph substructures in graph databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1861--1864", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137794", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, exploratory search has received much attention in information retrieval and database fields. This search paradigm assists users who do not have a clear search intent and are unfamiliar with the underlying data space. Specifically, query formulation evolves iteratively as the user becomes more familiar with the content. Despite its growing importance, exploratory search on graph-structured data has received little attention in the literature. We demonstrate a system called {\tt picasso} to realize exploratory sub-structure search on a graph database containing a set of small or medium-sized data graphs. {\tt picasso} embodies several novel features such as progressive (i.e., iterative) formulation of queries visually and incremental processing, multi-stream results exploration wall to visualize, explore, and analyze search results to identify possible search directions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cai:2017:DDI, author = "Ruichu Cai and Zijie Lu and Li Wang and Zhenjie Zhang and Tom Z. J. 
Fu and Marianne Winslett", title = "{DITIR}: distributed index for high throughput trajectory insertion and real-time temporal range query", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1865--1868", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137795", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The prosperity of mobile social network and location-based services, e.g., Uber, is backing the explosive growth of spatial temporal streams on the Internet. It raises new challenges to the underlying data store system, which is supposed to support extremely high-throughput trajectory insertion and low-latency querying with spatial and temporal constraints. State-of-the-art solutions, e.g., HBase, do not render satisfactory performance, due to the high overhead on index update. In this demonstration, we present DITIR, our new system prototype tailored to efficiently processing temporal and spatial queries over historical data as well as latest updates. Our system provides better performance guarantee, by physically partitioning the incoming data tuples on their arrivals and exploiting a template-based insertion schema, to reach the desired ingestion throughput. Load balancing mechanism is also introduced to DITIR, by using which the system is capable of achieving reliable performance against workload dynamics. Our demonstration shows that DITIR supports over 1 million tuple insertions in a second, when running on a 10-node cluster. It also significantly outperforms HBase by 7 times on ingestion throughput and 5 times faster on query latency.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pang:2017:FIV, author = "Zhifei Pang and Sai Wu and Gang Chen and Ke Chen and Lidan Shou", title = "{FlashView}: an interactive visual explorer for raw data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1869--1872", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137796", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "New data has been generated in an unexpected high speed. To get insight of those data, data analysts will perform a thorough study using state-of-the-art big data analytical tools. Before the analysis starts, a preprocessing is conducted, where data analyst tends to issue a few ad-hoc queries on a new dataset to explore and gain a better understanding. However, it is costly to perform such ad-hoc queries on large scale data using traditional data management systems, e.g., DBMS, because data loading and indexing are very expensive. In this demo, we propose a novel visual data explorer system, FlashView, which omits the loading process by directly querying raw data. FlashView applies approximate query processing technique to achieve real-time query results. It builds both in-memory index and disk index to facilitate the data scanning. It also supports tracking and updating multiple queries concurrently. Note that FlashView is not designed as a replacement of full-fledged DBMS. Instead, it tries to help the analysts quickly understand the characteristics of data, so he/she can selectively load data into the DBMS to do more sophisticated analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Subercaze:2017:UPT, author = "Julien Subercaze and Christophe Gravier and Syed Gillani and Abderrahmen Kammoun and Fr{\'e}d{\'e}rique Laforest", title = "{Upsortable}: programming top-$k$ queries over data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1873--1876", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137797", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Top-$k$ queries over data streams is a well studied problem. There exists numerous systems allowing to process continuous queries over sliding windows. At the opposite, non-append only streams call for ad-hoc solutions, e.g. tailor-made solutions implemented in a mainstream programming language. In the meantime, the Stream API and lambda expressions have been added in Java 8, thus gaining powerful operations for data stream processing. However, the Java Collections Framework does not provide data structures to safely and conveniently support sorted collections of evolving data. In this paper, we demonstrate Upsortable, an annotation-based approach that allows to use existing sorted collections from the standard Java API for dynamic data management. Our approach relies on a combination of pre-compilation abstract syntax tree modifications and runtime analysis of bytecode. Upsortable offers the developer a safe and time-efficient solution for developing top-$k$ queries on data streams while keeping a full compatibility with standard Java.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chamanara:2017:QSH, author = "Javad Chamanara and Birgitta K{\"o}nig-Ries and H. V. Jagadish", title = "{QUIS}: in-situ heterogeneous data source querying", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1877--1880", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137798", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing data integration frameworks are poorly suited for the special requirements of scientists. To answer a specific research question, often, excerpts of data from different sources need to be integrated. The relevant parts and the set of underlying sources may differ from query to query. The analyses also oftentimes involve frequently changing data and exploratory querying. Additionally, the data sources not only store data in different formats, but also provide inconsistent data access functionality. The classic Extract-Transform-Load (ETL) approach seems too complex and time-consuming and does not fit well with interest and expertise of the scientists. With QUIS (QUery In-Situ), we provide a solution for this problem. QUIS is an open source heterogeneous in-situ data querying system. It utilizes a federated query virtualization approach that is built upon plugged-in adapters. QUIS takes a user query and transforms appropriate portions of it into the corresponding computation model on individual data sources and executes it. It complements the segments of the query that the target data sources cannot execute. Hence, it guarantees full syntax and semantic support for its language on all data sources.
QUIS's in-situ querying facility almost eliminates the time to prepare the data while maintaining a competitive performance and steady scalability. The present demonstration illustrates interesting features of the system: virtual Schemas, heterogeneous joins, and visual query results. We provide a realistic data processing scenario to examine the system's features. Users can interact with QUIS using its desktop workbench, command line interface, or from any R client including RStudio Server.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alawini:2017:ADC, author = "Abdussalam Alawini and Susan B. Davidson and Wei Hu and Yinjun Wu", title = "Automating data citation in {CiteDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1881--1884", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137799", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An increasing amount of information is being collected in structured, evolving, curated databases, driving the question of how information extracted from such datasets via queries should be cited. While several databases say how data should be cited for web-page views of the database, they leave it to users to manually construct the citations. Furthermore, they do not say how data extracted by queries other than web-page views --- general queries --- should be cited. This demo shows how citations can be specified for a small set of views of the database, and used to automatically generate citations for general queries against the database.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2017:CEB, author = "Yixiang Fang and Reynold Cheng and Siqiang Luo and Jiafeng Hu and Kai Huang", title = "{C-Explorer}: browsing communities in large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1885--1888", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137800", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Community retrieval (CR) algorithms, which enable the extraction of subgraphs from large social networks (e.g., Facebook and Twitter), have attracted tremendous interest. Various CR solutions, such as $k$-core and codicil, have been proposed to obtain graphs whose vertices are closely related. In this paper, we propose the C-Explorer system to assist users in extracting, visualizing, and analyzing communities. C-Explorer provides online and interactive CR facilities, allowing a user to view her interesting graphs, indicate her required vertex $q$, and display the communities to which $q$ belongs. A seminal feature of C-Explorer is that it uses an attributed graph, whose vertices are associated with labels and keywords, and looks for an attributed community (or AC), whose vertices are structurally and semantically related. Moreover, C-Explorer implements several state-of-the-art CR algorithms, as well as functions for analyzing their effectiveness. We plan to make C-Explorer an open-source web-based platform, and design API functions for software developers to test their CR algorithms in our system.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2017:GPS, author = "Wenfei Fan and Jingbo Xu and Yinghui Wu and Wenyuan Yu and Jiaxin Jiang", title = "{GRAPE}: parallelizing sequential graph computations", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1889--1892", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate GRAPE, a parallel GRAPh query Engine. GRAPE advocates a parallel model based on a simultaneous fixed point computation in terms of partial and incremental evaluation. It differs from prior systems in its ability to parallelize existing sequential graph algorithms as a whole, without the need for recasting the entire algorithms into a new model. One of its unique features is that under a monotonic condition, GRAPE parallelization guarantees to terminate with correct answers as long as the sequential algorithms ``plugged in'' are correct. We demonstrate its parallel computations, ease-of-use and performance compared with the state-of-the-art graph systems. We also demonstrate a use case of GRAPE in social media marketing.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khoshkbarforoushha:2017:FDA, author = "Alireza Khoshkbarforoushha and Rajiv Ranjan and Qing Wang and Carsten Friedrich", title = "{Flower}: a data analytics flow elasticity manager", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1893--1896", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A data analytics flow typically operates on three layers: ingestion, analytics, and storage, each of which is provided by a data-intensive system. These systems are often available as cloud managed services, enabling the users to have pain-free deployment of data analytics flow applications such as click-stream analytics. Despite straightforward orchestration, elasticity management of the flows is challenging. This is due to: (a) heterogeneity of workloads and diversity of cloud resources such as queue partitions, compute servers and NoSQL throughputs capacity, (b) workload dependencies between the layers, and (c) different performance behaviours and resource consumption patterns. In this demonstration, we present Flower, a holistic elasticity management system that exploits advanced optimization and control theory techniques to manage elasticity of complex data analytics flows on clouds. Flower analyzes statistics and data collected from different data-intensive systems to provide the user with a suite of rich functionalities, including: workload dependency analysis, optimal resource share analysis, dynamic resource provisioning, and cross-platform monitoring. We will showcase various features of Flower using a real-world data analytics flow. 
We will allow the audience to explore Flower by visually defining and configuring a data analytics flow elasticity manager and get hands-on experience with integrated data analytics flow management.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:SAD, author = "Zhiyi Wang and Dongyan Zhou and Shimin Chen", title = "{STEED}: an analytical database system for tree-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1897--1900", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tree-structured data formats, such as JSON and Protocol Buffers, are capable of expressing sophisticated data types, including nested, repeated, and missing values. While such expressing power contributes to their popularity in real-world applications, it presents a significant challenge for systems supporting tree-structured data. Existing systems have focused on general-purpose solutions either extending RDBMSs or designing native systems. However, the general-purpose approach often results in sophisticated data structures and algorithms, which may not reflect and optimize for the actual structure patterns in the real world. In this demonstration, we showcase Steed, an analytical database System for tree-structured data. We use the insights gained by analyzing representative real-world tree structured data as guidelines in the design of Steed. Steed learns and extracts a schema tree for a data set and uses the schema tree to reduce the storage space and improve the efficiency of data field accesses. 
We observe that sub-structures in real world data are often simple, while the tree-structured data types can support very sophisticated structures. We optimize the storage structure, the column assembling algorithm, and the in-memory layout for the simple sub-structures (a.k.a. simple paths). Compared to representative state-of-the-art systems (i.e. PostgreSQL/JSON, MongoDB, and Hive+Parquet), Steed achieves orders of magnitude better performance for data analysis queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xiao:2017:LLC, author = "Yonghui Xiao and Li Xiong and Si Zhang and Yang Cao", title = "{LocLok}: location cloaking with differential privacy via hidden {Markov} model", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1901--1904", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate LocLok, a LOCation-cLOaKing system to protect the locations of a user with differential privacy. LocLok has two features: (a) it protects locations under temporal correlations described through hidden Markov model; (b) it releases the optimal noisy location with the planar isotropic mechanism (PIM), the first mechanism that achieves the lower bound of differential privacy. We show the detailed computation of LocLok with the following components: (a) how to generate the possible locations with Markov model, (b) how to perturb the location with PIM, and (c) how to make inference about the true location in Markov model. An online system with real-world dataset will be presented with the computation details.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2017:SAI, author = "Xiangnan Ren and Olivier Cur{\'e} and Li Ke and Jeremy Lhez and Badre Belabbess and Tendry Randriamalala and Yufan Zheng and Gabriel Kepeklian", title = "{Strider}: an adaptive, inference-enabled distributed {RDF} stream processing engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1905--1908", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-time processing of data streams emanating from sensors is becoming a common task in industrial scenarios. An increasing number of processing jobs executed over such platforms are requiring reasoning mechanisms. The key implementation goal is thus to efficiently handle massive incoming data streams and support reasoning, data analytic services. Moreover, in an on-going industrial project on anomaly detection in large potable water networks, we are facing the effect of dynamically changing data and work characteristics in stream processing. The Strider system addresses these research and implementation challenges by considering scalability, fault-tolerance, high throughput and acceptable latency properties. We will demonstrate the benefits of Strider on an Internet of Things-based real world and industrial setting.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:CAT, author = "Yan Li and Ngai Meng Kou and Hao Wang and Leong Hou U. 
and Zhiguo Gong", title = "A confidence-aware top-$k$ query processing toolkit on crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1909--1912", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137806", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ranking techniques have been widely used in ubiquitous applications like recommendation, information retrieval, etc. For ranking computation hostile but human friendly items, crowdsourcing is considered as an emerging technique to process the ranking by human power. However, there is a lack of an easy-to-use toolkit for answering crowdsourced top-$k$ query with minimal effort. In this work, we demonstrate an interactive programming toolkit that is a unified solution for answering the crowd-sourced top-$k$ queries. The toolkit employs a new confidence-aware crowdsourced top-$k$ algorithm, SPR. The whole progress of the algorithm is monitored and visualized to end users in a timely manner. Besides the visualized result and the statistics, the system also reports the estimation of the monetary cost and the breakdown of each phase. Based on the estimation, end users can strike a balance between the budget and the quality through the interface of this toolkit.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fionda:2017:EQK, author = "Valeria Fionda and Giuseppe Pirr{\`o}", title = "Explaining and querying knowledge graphs by relatedness", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1913--1916", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137807", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate RECAP, a tool that explains relatedness between entities in Knowledge Graphs (KGs) and implements a query by relatedness paradigm that allows to retrieve entities related to those in input. One of the peculiarities of RECAP is that it does not require any data preprocessing and can combine knowledge from multiple KGs. The underlying algorithmic techniques are reduced to the execution of SPARQL queries plus some local refinement. This makes the tool readily available on a large variety of KGs accessible via SPARQL endpoints. To show the general applicability of the tool, we will cover a set of use cases drawn from a variety of knowledge domains (e.g., biology, movies, co-authorship networks) and report on the concrete usage of RECAP in the SENSE4US FP7 project. We will underline the technical aspects of the system and give details on its implementation. The target audience of the demo includes both researchers and practitioners and aims at reporting on the benefits of RECAP in practical knowledge discovery applications.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kunjir:2017:TAM, author = "Mayuresh Kunjir and Shivnath Babu", title = "{Thoth} in action: memory management in modern data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1917--1920", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137808", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Allocation and usage of memory in modern data-processing platforms is based on an interplay of algorithms at multiple levels: (i) at the resource-management level across containers allocated by resource managers like Mesos and Yarn, (ii) at the container level among the OS and processes such as the Java Virtual Machine (JVM), (iii) at the framework level for caching, aggregation, data shuffles, and application data structures, and (iv) at the JVM level across various pools such as the Young and Old Generation as well as the heap versus off-heap. We use Thoth, a data-driven platform for multi-system cluster management, to build a deep understanding of different interplays in memory management options. Through multiple memory management apps built in Thoth, we demonstrate how Thoth can deal with multiple levels of memory management as well as multi-tenant nature of clusters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schule:2017:MSS, author = "Maximilian E. Sch{\"u}le and Pascal M. N. 
Schliski and Thomas Hutzelmann and Tobias Rosenberger and Viktor Leis and Dimitri Vorona and Alfons Kemper and Thomas Neumann", title = "{Monopedia}: staying single is good enough --- the hyper way for web scale applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1921--1924", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137809", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In order to handle the database load for web scale applications, the conventional wisdom is that a cluster of database servers and a caching layer are essential. In this work, we argue that modern main memory database systems are often fast enough to consolidate this complex architecture into a single server (plus an additional fail over system). To demonstrate this claim, we design the Monopedia Benchmark, a benchmark for web scale applications modeled after Wikipedia. Using this benchmark, we show that it is indeed possible to run the database workload of one of the largest web sites in the world on a single database server.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2017:DDM, author = "Ji Sun and Zeyuan Shang and Guoliang Li and Dong Deng and Zhifeng Bao", title = "{Dima}: a distributed in-memory similarity-based query processing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1925--1928", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137810", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data analysts in industries spend more than 80\% of time on data cleaning and integration in the whole process of data analytics due to data errors and inconsistencies. It calls for effective query processing techniques to tolerate the errors and inconsistencies. In this paper, we develop a distributed in-memory similarity-based query processing system called Dima. Dima supports two core similarity-based query operations, i.e., similarity search and similarity join. Dima extends the SQL programming interface for users to easily invoke these two operations in their data analysis jobs. To avoid expensive data transformation in a distributed environment, we design selectable signatures where two records approximately match if they share common signatures. More importantly, we can adaptively select the signatures to balance the workload. Dima builds signature-based global indexes and local indexes to support efficient similarity search and join. Since Spark is one of the widely adopted distributed in-memory computing systems, we have seamlessly integrated Dima into Spark and developed effective query optimization techniques in Spark. To the best of our knowledge, this is the first full-fledged distributed in-memory system that can support similarity-based query processing. 
We demonstrate our system in several scenarios, including entity matching, web table integration and query recommendation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chekol:2017:TTC, author = "Melisachew W. Chekol and Giuseppe Pirr{\`o} and Joerg Schoenfisch and Heiner Stuckenschmidt", title = "{TeCoRe}: temporal conflict resolution in knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1929--1932", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137811", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The management of uncertainty is crucial when harvesting structured content from unstructured and noisy sources. Knowledge Graphs (KGs), maintaining both numerical and non-numerical facts supported by an underlying schema, are a prominent example. Knowledge Graph management is challenging because: (i) most of existing KGs focus on static data, thus impeding the availability of timewise knowledge; (ii) facts in KGs are usually accompanied by a confidence score, which witnesses how likely it is for them to hold. We demonstrate TeCoRe, a system for temporal inference and conflict resolution in uncertain temporal knowledge graphs (UTKGs). At the heart of TeCoRe are two state-of-the-art probabilistic reasoners that are able to deal with temporal constraints efficiently. While one is scalable, the other can cope with more expressive constraints. The demonstration will focus on enabling users and applications to find inconsistencies in UTKGs.
TeCoRe provides an interface allowing to select UTKGs and editing constraints; shows the maximal consistent subset of the UTKG, and displays statistics (e.g., number of noisy facts removed) about the debugging process.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:MTD, author = "Xupeng Li and Bin Cui and Yiru Chen and Wentao Wu and Ce Zhang", title = "{MLog}: towards declarative in-database machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1933--1936", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137812", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate MLog, a high-level language that integrates machine learning into data management systems. Unlike existing machine learning frameworks (e.g., TensorFlow, Theano, and Caffe), MLog is declarative, in the sense that the system manages all data movement, data persistency, and machine-learning related optimizations (such as data batching) automatically. Our interactive demonstration will show audience how this is achieved based on the novel notion of tensoral views (TViews), which are similar to relational views but operate over tensors with linear algebra. With MLog, users can succinctly specify not only simple models such as SVM (in just two lines), but also sophisticated deep learning models that are not supported by existing in-database analytics systems (e.g., MADlib, PAL, and SciDB), as a series of cascaded TViews. Given the declarative nature of MLog, we further demonstrate how query/program optimization techniques can be leveraged to translate MLog programs into native TensorFlow programs.
The performance of the automatically generated TensorFlow programs is comparable to that of hand-optimized ones.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Demiralp:2017:FRV, author = "{\c{C}}a{\u{g}}atay Demiralp and Peter J. Haas and Srinivasan Parthasarathy and Tejaswini Pedapati", title = "{Foresight}: recommending visual insights", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1937--1940", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137813", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Current tools for exploratory data analysis (EDA) require users to manually select data attributes, statistical computations and visual encodings. This can be daunting for large-scale, complex data. We introduce Foresight, a system that helps the user rapidly discover visual insights from large high-dimensional datasets. Formally, an ``insight'' is a strong manifestation of a statistical property of the data, e.g., high correlation between two attributes, high skewness or concentration about the mean of a single attribute, a strong clustering of values, and so on. For each insight type, Foresight initially presents visualizations of the top k instances in the data, based on an appropriate ranking metric. The user can then look at ``nearby'' insights by issuing ``insight queries'' containing constraints on insight strengths and data attributes. Thus the user can directly explore the space of insights, rather than the space of data dimensions and visual encodings as in other visual recommender systems. Foresight also provides ``global'' views of insight space to help orient the user and ensure a thorough exploration process.
Furthermore, Foresight facilitates interactive exploration of large datasets through fast, approximate sketching.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jacobs:2017:BDT, author = "Steven Jacobs and Md Yusuf Sarwar Uddin and Michael Carey and Vagelis Hristidis and Vassilis J. Tsotras and N. Venkatasubramanian and Yao Wu and Syed Safir and Purvi Kaul and Xikui Wang and Mohiuddin Abdul Qader and Yawei Li", title = "A {BAD} demonstration: towards {Big Active Data}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1941--1944", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137814", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearly all of today's Big Data systems are passive in nature. We demonstrate our Big Active Data (``BAD'') system, a scalable system that continuously and reliably captures Big Data and facilitates the timely and automatic delivery of new information to a large population of interested users as well as supporting analyses of historical information. We built our BAD project by extending an existing scalable, open-source BDMS (AsterixDB [1]) in this active direction. In this demonstration, we allow our audience to participate in an emergency notification application built on top of our BAD platform, and highlight its capabilities.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hassan:2017:CFE, author = "Naeemul Hassan and Gensheng Zhang and Fatma Arslan and Josue Caraballo and Damian Jimenez and Siddhant Gawsane and Shohedul Hasan and Minumol Joseph and Aaditya Kulkarni and Anil Kumar Nayak and Vikas Sable and Chengkai Li and Mark Tremayne", title = "{ClaimBuster}: the first-ever end-to-end fact-checking system", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1945--1948", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137815", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Our society is struggling with an unprecedented amount of falsehoods, hyperboles, and half-truths. Politicians and organizations repeatedly make the same false claims. Fake news floods the cyberspace and even allegedly influenced the 2016 election. In fighting false information, the number of active fact-checking organizations has grown from 44 in 2014 to 114 in early 2017.$^1$ Fact-checkers vet claims by investigating relevant data and documents and publish their verdicts. For instance, PolitiFact.com, one of the earliest and most popular fact-checking projects, gives factual claims truthfulness ratings such as True, Mostly True, Half true, Mostly False, False, and even ``Pants on Fire''. In the U.S., the election year made fact-checking a part of household terminology. For example, during the first presidential debate on September 26, 2016, NPR.org's live fact-checking website drew 7.4 million page views and delivered its biggest traffic day ever.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deep:2017:QDR, author = "Shaleen Deep and Paraschos Koutris and Yash Bidasaria", title = "{QIRANA} demonstration: real time scalable query pricing", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1949--1952", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137816", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The last decade has seen a deluge in data collection and dissemination across a broad range of areas. This phenomena has led to creation of online data markets where entities engage in sale and purchase of data. In this scenario, the key challenge for the data market platform is to ensure that it allows real time, scalable, arbitrage-free pricing of user queries. At the same time, the platform needs to be flexible enough for sellers in order to customize the setup of the data to be sold. In this paper, we describe the demonstration of Qirana, a light weight framework that implements query-based pricing at scale. The framework acts as a layer between the end users (buyers and sellers) and the database. Qirana's demonstration features that we highlight are: (i) allows sellers to choose from a variety of pricing functions based on their requirements and incorporates price points as a guide for query pricing; (ii) helps the seller set parameters by mocking workloads; (iii) buyers engage with the platform by directly asking queries and track their budget per dataset. We demonstrate the tunable parameters of our framework over a real-world dataset, illustrating the promise of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2017:DDT, author = "Meraj Khan and Larry Xu and Arnab Nandi and Joseph M. Hellerstein", title = "{DataTweener}: a demonstration of a tweening engine for incremental visualization of data transforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1953--1956", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137817", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the development and advancement of new data interaction modalities, data exploration and analysis has become a highly interactive process situating the user in a session of successive queries. With rapidly changing results, it becomes difficult for the end user to fully comprehend transformations, especially the transforms corresponding to complex queries. We introduce ``data tweening'' as an informative way of visualizing structural data transforms, presenting the users with a series of incremental visual representations of a resultset transformation. We present transformations as ordered sequences of basic structural transforms and visual cues. The sequences are generated using an automated framework which utilizes differences between the consecutive resultsets and queries in a query session. We evaluate the effectiveness of tweening as a visualization method through a user study.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Salimi:2017:ZCI, author = "Babak Salimi and Corey Cole and Dan R. K. 
Ports and Dan Suciu", title = "{ZaliQL}: causal inference from observational data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1957--1960", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137818", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Causal inference from observational data is a subject of active research and development in statistics and computer science. Many statistical software packages have been developed for this purpose. However, these toolkits do not scale to large datasets. We propose and demonstrate ZaliQL: a SQL-based framework for drawing causal inference from observational data. ZaliQL supports the state-of-the-art methods for causal inference and runs at scale within PostgreSQL database system. In addition, we built a visual interface to wrap around ZaliQL. In our demonstration, we will use this GUI to show a live investigation of the causal effect of different weather conditions on flight delays.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Alarabi:2017:DSH, author = "Louai Alarabi and Mohamed F. Mokbel", title = "A demonstration of {ST-Hadoop}: a {MapReduce} framework for big spatio-temporal data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1961--1964", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137819", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demo presents ST-Hadoop; the first full-fledged open-source MapReduce framework with a native support for spatio-temporal data. 
ST-Hadoop injects spatio-temporal awareness in the Hadoop base code, which results in achieving order(s) of magnitude better performance than Hadoop and SpatialHadoop when dealing with spatio-temporal data and queries. The key idea behind ST-Hadoop is its ability in indexing spatio-temporal data within Hadoop Distributed File System (HDFS). A real system prototype of ST-Hadoop, running on a local cluster of 24 machines, is demonstrated with two big-spatio-temporal datasets of Twitter and NYC Taxi data, each of around one billion records.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bharadwaj:2017:CIL, author = "S. Bharadwaj and L. Chiticariu and M. Danilevsky and S. Dhingra and S. Divekar and A. Carreno-Fuentes and H. Gupta and N. Gupta and S.-D. Han and M. Hern{\'a}ndez and H. Ho and P. Jain and S. Joshi and H. Karanam and S. Krishnan and R. Krishnamurthy and Y. Li and S. Manivannan and A. Mittal and F. {\"O}zcan and A. Quamar and P. Raman and D. Saha and K. Sankaranarayanan and J. Sen and P. Sen and S. Vaithyanathan and M. Vasa and H. Wang and H. Zhu", title = "Creation and interaction with large-scale domain-specific knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1965--1968", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137820", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to create and interact with large-scale domain-specific knowledge bases from unstructured/semi-structured data is the foundation for many industry-focused cognitive systems. 
We will demonstrate the Content Services system that provides cloud services for creating and querying high-quality domain-specific knowledge bases by analyzing and integrating multiple (un/semi)structured content sources. We will showcase an instantiation of the system for a financial domain. We will also demonstrate both cross-lingual natural language queries and programmatic API calls for interacting with this knowledge base.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jonathan:2017:DSC, author = "Christopher Jonathan and Mohamed F. Mokbel", title = "A demonstration of {Stella}: a crowdsourcing-based geotagging framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1969--1972", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137821", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper demonstrates Stella; an efficient crowdsourcing-based geotagging framework for any types of objects. In this demonstration, we showcase the effectiveness of Stella in geotagging images via two different scenarios: (1) we provide a graphical interface to show the process of a geotagging process that have been done by using Amazon Mechanical Turk, (2) we seek help from the conference attendees to propose an image to be geotagged or to help us geotag an image by using our application during the demonstration period. At the end of the demonstration period, we will show the geotagging result.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Moll:2017:EBV, author = "Oscar Moll and Aaron Zalewski and Sudeep Pillai and Sam Madden and Michael Stonebraker and Vijay Gadepally", title = "Exploring big volume sensor data with {Vroom}", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1973--1976", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137822", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "State of the art sensors within a single autonomous vehicle (AV) can produce video and LIDAR data at rates greater than 30 GB/hour. Unsurprisingly, even small AV research teams can accumulate tens of terabytes of sensor data from multiple trips and multiple vehicles. AV practitioners would like to extract information about specific locations or specific situations for further study, but are often unable to. Queries over AV sensor data are different from generic analytics or spatial queries because they demand reasoning about fields of view as well as heavy computation to extract features from scenes. In this article and demo we present Vroom, a system for ad-hoc queries over AV sensor databases. Vroom combines domain specific properties of AV datasets with selective indexing and multi-query optimization to address challenges posed by AV sensor data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mottin:2017:NTE, author = "Davide Mottin and Matteo Lissandrini and Yannis Velegrakis and Themis Palpanas", title = "New trends on exploratory methods for data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1977--1980", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137824", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data usually comes in a plethora of formats and dimensions, rendering the exploration and information extraction processes cumbersome. Thus, being able to cast exploratory queries in the data with the intent of having an immediate glimpse on some of the data properties is becoming crucial. An exploratory query should be simple enough to avoid complicate declarative languages (such as SQL) and mechanisms, and at the same time retain the flexibility and expressiveness of such languages. Recently, we have witnessed a rediscovery of the so called example-based methods, in which the user, or the analyst circumvent query languages by using examples as input. An example is a representative of the intended results, or in other words, an item from the result set. Example-based methods exploit inherent characteristics of the data to infer the results that the user has in mind, but may not able to (easily) express. They can be useful both in cases where a user is looking for information in an unfamiliar dataset, or simply when she is exploring the data without knowing what to find in there. In this tutorial, we present an excursus over the main methods for exploratory analysis, with a particular focus on example-based methods. 
We show how different data types require different techniques, and present algorithms that are specifically designed for relational, textual, and graph data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Khan:2017:SSD, author = "Arijit Khan and Sourav S. Bhowmick and Francesco Bonchi", title = "Summarizing static and dynamic big graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1981--1984", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137825", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale, highly-interconnected networks pervade our society and the natural world around us, including the World Wide Web, social networks, knowledge graphs, genome and scientific databases, medical and government records. The massive scale of graph data often surpasses the available computation and storage resources. Besides, users get overwhelmed by the daunting task of understanding and using such graphs due to their sheer volume and complexity. Hence, there is a critical need to summarize large graphs into concise forms that can be more easily visualized, processed, and managed. Graph summarization has indeed attracted a lot of interests from various research communities, such as sociology, physics, chemistry, bioinformatics, and computer science. Different ways of summarizing graphs have been invented that are often complementary to each other. In this tutorial, we discuss algorithmic advances on graph summarization in the context of both classical (e.g., static graphs) and emerging (e.g., dynamic and stream graphs) applications. 
We emphasize the current challenges and highlight some future research directions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mouratidis:2017:GAT, author = "Kyriakos Mouratidis", title = "Geometric approaches for top-$k$ queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1985--1987", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137826", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Top-$k$ processing is a well-studied problem with numerous applications that is becoming increasingly relevant with the growing availability of recommendation systems and decision making software. The objective of this tutorial is twofold. First, we will delve into the geometric aspects of top-$k$ processing. Second, we will cover complementary features to top-$k$ queries, with strong practical relevance and important applications, that have a computational geometric nature. The tutorial will close with insights in the effect of dimensionality on the meaningfulness of top-$k$ queries, and interesting similarities to nearest neighbor search.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tong:2017:SCC, author = "Yongxin Tong and Lei Chen and Cyrus Shahabi", title = "Spatial crowdsourcing: challenges, techniques, and applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1988--1991", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137827", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowdsourcing is a new computing paradigm where humans are actively enrolled to participate in the procedure of computing, especially for tasks that are intrinsically easier for humans than for computers. The popularity of mobile computing and sharing economy has extended conventional web-based crowdsourcing to spatial crowdsourcing (SC), where spatial data such as location, mobility and the associated contextual information, plays a central role. In fact, spatial crowdsourcing has stimulated a series of recent industrial successes including Citizen Sensing (Waze), P2P ride-sharing (Uber) and Real-time Online-To-Offline (O2O) services (Instacart and Postmates). In this tutorial, we review the paradigm shift from web-based crowdsourcing to spatial crowdsourcing. We dive deep into the challenges and techniques brought by the unique spatio-temporal characteristics of spatial crowdsourcing. Particularly, we survey new designs in task assignment, quality control, incentive mechanism design and privacy protection on spatial crowdsourcing platforms, as well as the new trend to incorporate crowdsourcing to enhance existing spatial data processing techniques. 
We also discuss case studies of representative spatial crowdsourcing systems and raise open questions and current challenges for the audience to easily comprehend the tutorial and to advance this important research area.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eldawy:2017:EBS, author = "Ahmed Eldawy and Mohamed F. Mokbel", title = "The era of big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1992--1995", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137828", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this tutorial, we present the recent work in the database community for handling Big Spatial Data. This topic became very hot due to the recent explosion in the amount of spatial data generated by smart phones, satellites and medical devices, among others. This tutorial goes beyond the use of existing systems as-is (e.g., Hadoop, Spark or Impala), and digs deep into the core components of big systems (e.g., indexing and query processing) to describe how they are designed to handle big spatial data. During this 90-minute tutorial, we review the state-of-the-art work in the area of Big Spatial Data while classifying the existing research efforts according to the implementation approach, underlying architecture, and system components. In addition, we provide case studies of full-fledged systems and applications that handle Big Spatial Data which allows the audience to better comprehend the whole tutorial.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Giatrakos:2017:CER, author = "Nikos Giatrakos and Alexander Artikis and Antonios Deligiannakis and Minos Garofalakis", title = "Complex event recognition in the big data era", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1996--1999", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137829", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The concept of event processing is established as a generic computational paradigm in various application fields, ranging from data processing in Web environments, over maritime and transport, to finance and medicine. Events report on state changes of a system and its environment. Complex Event Recognition (CER) in turn, refers to the identification of complex/composite events of interest, which are collections of simple events that satisfy some pattern, thereby providing the opportunity for reactive and proactive measures. Examples include the recognition of attacks in computer network nodes, human activities on video content, emerging stories and trends on the Social Web, traffic and transport incidents in smart cities, fraud in electronic marketplaces, cardiac arrhythmias, and epidemic spread. In each scenario, CER allows to make sense of Big event Data streams and react accordingly. The goal of this tutorial is to provide a step-by-step guide for realizing CER in the Big Data era. To do so, it elaborates on major challenges and describes algorithmic toolkits for optimized manipulation of event streams characterized by high volume, velocity and/or lack of veracity, placing emphasis on distributed CER over potentially heterogeneous (data variety) event sources. 
Finally, we highlight future research directions in the field.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mohan:2017:TBD, author = "C. Mohan", title = "{Tutorial}: blockchains and databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2000--2001", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the last few years, blockchain (also known as distributed ledger), the underlying technology of the permissionless or public Bitcoin network, has become very popular for use in private or permissioned environments. Computer companies like IBM and Microsoft, and many key players in different vertical industry segments have recognized the utility of blockchains for securely managing assets (physical/digital) other than cryptocurrencies. IBM did some pioneering work by architecting and implementing a private blockchain system, and then open sourcing it. That system, which has since then been named Fabric, is being enhanced via the Hyperledger Consortium set up under the auspices of the Linux Foundation. Other efforts in the industry include Enterprise Ethereum, R3 Corda and BigchainDB.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zakhary:2017:CWS, author = "Victor Zakhary and Divyakant Agrawal and Amr {El Abbadi}", title = "Caching at the web scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2002--2005", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's web applications and social networks are serving billions of users around the globe. These users generate billions of key lookups and millions of data object updates per second. A single user's social network page load requires hundreds of key lookups. This scale creates many design challenges for the underlying storage systems. First, these systems have to serve user requests with low latency. Any increase in the request latency leads to a decrease in user interest. Second, storage systems have to be highly available. Failures should be handled seamlessly without affecting user requests. Third, users consume an order of magnitude more data than they produce. Therefore, storage systems have to be optimized for read-intensive workloads. To address these challenges, distributed in-memory caching services have been widely deployed on top of persistent storage. In this tutorial, we survey the recent developments in distributed caching services. We present the algorithmic and architectural efforts behind these systems focusing on the challenges in addition to open research questions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:HLD, author = "Guoliang Li", title = "Human-in-the-loop data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2006--2017", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data integration aims to integrate data in different sources and provide users with a unified view. However, data integration cannot be completely addressed by purely automated methods. We propose a hybrid human-machine data integration framework that harnesses human ability to address this problem, and apply it initially to the problem of entity matching. The framework first uses rule-based algorithms to identify possible matching pairs and then utilizes the crowd to refine these candidate pairs in order to compute actual matching pairs. In the first step, we propose similarity-based rules and knowledge-based rules to obtain some candidate matching pairs, and develop effective algorithms to learn these rules based on some given positive and negative examples. We build a distributed in-memory system DIMA to efficiently apply these rules. In the second step, we propose a selection-inference-refine framework that uses the crowd to verify the candidate pairs. We first select some ``beneficial'' tasks to ask the crowd and then use transitivity and partial order to infer the answers of unasked tasks based on the crowdsourcing results of the asked tasks. Next we refine the inferred answers with high uncertainty due to the disagreement from the crowd. We develop a crowd-powered database system CDB and deploy it on real crowdsourcing platforms. 
CDB allows users to utilize a SQL-like language for processing crowd-based queries. Lastly, we provide emerging challenges in human-in-the-loop data integration.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lehner:2017:DCU, author = "Wolfgang Lehner", title = "The data center under your desk: how disruptive is modern hardware for {DB} system design?", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2018--2019", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While we are already used to see more than 1,000 cores within a single machine, the next processing platforms for database engines will be heterogeneous with built-in GPU-style processors as well as specialized FPGAs or chips with domain-specific instruction sets. Moreover, the traditional volatile as well as the upcoming non-volatile RAM with capacities in the 100s of TBytes per machine will provide great opportunities for storage engines but also call for radical changes on the architecture of such systems. Finally, the emergence of economically affordable, high-speed/low-latency interconnects as a basis for rack-scale computing is questioning long-standing folklore algorithmic assumptions but will certainly play an important role in the big picture of building modern data management platforms. In this talk, we will try to classify and review existing approaches from a performance, robustness, as well as energy efficiency perspective and pinpoint interesting starting points for further research activities.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Milo:2017:SMM, author = "Tova Milo", title = "7 secrets that my mother didn't tell me", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2020--2020", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "What does it take to be a good researcher? And, is it different when you are a woman? These are questions that many of us are wondering about throughout our career. Being honored with a VLDB Women in Database Research Award, I would like to share with you in this talk some of the secrets to successful research that I have learned over the years. These secrets highlight some of the fundamental research directions that I have taken. No less importantly, they explain how I successfully got to work on them, both personally and professionally.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lv:2017:IPL, author = "Qin Lv and William Josephson and Zhe Wang and Moses Charikar and Kai Li", title = "Intelligent probing for locality sensitive hashing: multi-probe {LSH} and beyond", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "2021--2024", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The past decade has been marked by the (continued) explosion of diverse data content and the fast development of intelligent data analytics techniques. One problem we identified in the mid-2000s was similarity search of feature-rich data. The challenge here was achieving both high accuracy and high efficiency in high-dimensional spaces. Locality sensitive hashing (LSH), which uses certain random space partitions and hash table lookups to find approximate nearest neighbors, was a promising approach with theoretical guarantees. But LSH alone was insufficient since a large number of hash tables were required to achieve good search quality. Building on an idea of Panigrahy, our multi-probe LSH method introduced the idea of intelligent probing. Given a query object, we strategically probe its neighboring hash buckets (in a query-dependent fashion) by calculating the statistical probabilities of similar objects falling into each bucket. Such intelligent probing can significantly reduce the number of hash tables while achieving high quality. In this paper, we revisit the problem motivation, the challenges, the key design considerations of multi-probe LSH, as well as discuss recent developments in this space and some questions for further research.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qin:2017:SRB, author = "Dai Qin and Angela Demke Brown and Ashvin Goel", title = "Scalable replay-based replication for fast databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2025--2036", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Primary-backup replication is commonly used for providing fault tolerance in databases. It is performed by replaying the database recovery log on a backup server. Such a scheme raises several challenges for modern, high-throughput multi-core databases. It is hard to replay the recovery log concurrently, and so the backup can become the bottleneck. Moreover, with the high transaction rates on the primary, the log transfer can cause network bottlenecks. Both these bottlenecks can significantly slow the primary database. In this paper, we propose using record-replay for replicating fast databases. Our design enables replay to be performed scalably and concurrently, so that the backup performance scales with the primary performance. At the same time, our approach requires only 15--20\% of the network bandwidth required by traditional logging, reducing network infrastructure costs significantly.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2017:SSE, author = "Kai Ren and Qing Zheng and Joy Arulraj and Garth Gibson", title = "{SlimDB}: a space-efficient key--value storage engine for semi-sorted data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2037--2048", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern key--value stores often use write-optimized indexes and compact in-memory indexes to speed up read and write performance. One popular write-optimized index is the Log-structured merge-tree (LSM-tree) which provides indexed access to write-intensive data. It has been increasingly used as a storage backbone for many services, including file system metadata management, graph processing engines, and machine learning feature storage engines. Existing LSM-tree implementations often exhibit high write amplifications caused by compaction, and lack optimizations to maximize read performance on solid-state disks. The goal of this paper is to explore techniques that leverage common workload characteristics shared by many systems using key--value stores to reduce the read/write amplification overhead typically associated with general-purpose LSM-tree implementations. Our experiments show that by applying these design techniques, our new implementation of a key--value store, SlimDB, can be two to three times faster, use less memory to cache metadata indices, and show lower tail latency in read operations compared to popular LSM-tree implementations such as LevelDB and RocksDB.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abdelaziz:2017:SEC, author = "Ibrahim Abdelaziz and Razen Harbi and Zuhair Khayyat and Panos Kalnis", title = "A survey and experimental comparison of distributed {SPARQL} engines for very large {RDF} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2049--2060", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed SPARQL engines promise to support very large RDF datasets by utilizing shared-nothing computer clusters. Some are based on distributed frameworks such as MapReduce; others implement proprietary distributed processing; and some rely on expensive preprocessing for data partitioning. These systems exhibit a variety of trade-offs that are not well-understood, due to the lack of any comprehensive quantitative and qualitative evaluation. In this paper, we present a survey of 22 state-of-the-art systems that cover the entire spectrum of distributed RDF data processing and categorize them by several characteristics. Then, we select 12 representative systems and perform extensive experimental evaluation with respect to preprocessing cost, query performance, scalability and workload adaptability, using a variety of synthetic and real large datasets with up to 4.3 billion triples. Our results provide valuable insights for practitioners to understand the trade-offs for their usage scenarios. Finally, we publish online our evaluation framework, including all datasets and workloads, for researchers to compare their novel systems against the existing ones.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kunft:2017:BEM, author = "Andreas Kunft and Asterios Katsifodimos and Sebastian Schelter and Tilmann Rabl and Volker Markl", title = "{BlockJoin}: efficient matrix partitioning through joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2061--2072", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Linear algebra operations are at the core of many Machine Learning (ML) programs. At the same time, a considerable amount of the effort for solving data analytics problems is spent in data preparation. As a result, end-to-end ML pipelines often consist of (i) relational operators used for joining the input data, (ii) user defined functions used for feature extraction and vectorization, and (iii) linear algebra operators used for model training and cross-validation. Often, these pipelines need to scale out to large datasets. In this case, these pipelines are usually implemented on top of dataflow engines like Hadoop, Spark, or Flink. These dataflow engines implement relational operators on row-partitioned datasets. However, efficient linear algebra operators use block-partitioned matrices. As a result, pipelines combining both kinds of operators require rather expensive changes to the physical representation, in particular re-partitioning steps. In this paper, we investigate the potential of reducing shuffling costs by fusing relational and linear algebra operations into specialized physical operators. We present BlockJoin, a distributed join algorithm which directly produces block-partitioned results.
To minimize shuffling costs, BlockJoin applies database techniques known from columnar processing, such as index-joins and late materialization, in the context of parallel dataflow engines. Our experimental evaluation shows speedups up to 6$ \times $ and the skew resistance of BlockJoin compared to state-of-the-art pipelines implemented in Spark.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Choi:2017:EMR, author = "Dong-Wan Choi and Jian Pei and Thomas Heinis", title = "Efficient mining of regional movement patterns in semantic trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2073--2084", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Semantic trajectory pattern mining is becoming more and more important with the rapidly growing volumes of semantically rich trajectory data. Extracting sequential patterns in semantic trajectories plays a key role in understanding semantic behaviour of human movement, which can widely be used in many applications such as location-based advertising, road capacity optimisation, and urban planning. However, most of existing works on semantic trajectory pattern mining focus on the entire spatial area, leading to missing some locally significant patterns within a region. Based on this motivation, this paper studies a regional semantic trajectory pattern mining problem, aiming at identifying all the regional sequential patterns in semantic trajectories. Specifically, we propose a new density scheme to quantify the frequency of a particular pattern in space, and thereby formulate a new mining problem of finding all the regions in which such a pattern densely occurs. 
For the proposed problem, we develop an efficient mining algorithm, called RegMiner (Regional Semantic Trajectory Pattern Miner), which effectively reveals movement patterns that are locally frequent in such a region but not necessarily dominant in the entire space. Our empirical study using real trajectory data shows that RegMiner finds many interesting local patterns that are hard to find by a state-of-the-art global pattern mining scheme, and it also runs several orders of magnitude faster than the global pattern mining algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kiefer:2017:EJS, author = "Martin Kiefer and Max Heimel and Sebastian Bre{\ss} and Volker Markl", title = "Estimating join selectivities using bandwidth-optimized kernel density models", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "13", pages = "2085--2096", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:20 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Accurately predicting the cardinality of intermediate plan operations is an essential part of any modern relational query optimizer. The accuracy of said estimates has a strong and direct impact on the quality of the generated plans, and incorrect estimates can have a negative impact on query performance. One of the biggest challenges in this field is to predict the result size of join operations. Kernel Density Estimation (KDE) is a statistical method to estimate multivariate probability distributions from a data sample. Previously, we introduced a modern, self-tuning selectivity estimator for range scans based on KDE that out-performs state-of-the-art multidimensional histograms and is efficient to evaluate on graphics cards. 
In this paper, we extend these bandwidth-optimized KDE models to estimate the result size of single and multiple joins. In particular, we propose two approaches: (1) Building a KDE model from a sample drawn from the join result. (2) Efficiently combining the information from base table KDE models. We evaluated our KDE-based join estimators on a variety of synthetic and real-world datasets, demonstrating that they are superior to state-of-the-art join estimators based on sketching or sampling.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Menon:2017:ROF, author = "Prashanth Menon and Todd C. Mowry and Andrew Pavlo", title = "Relaxed operator fusion for in-memory databases: making compilation, vectorization, and prefetching work together at last", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "1--13", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory database management systems (DBMSs) are a key component of modern on-line analytic processing (OLAP) applications, since they provide low-latency access to large volumes of data. Because disk accesses are no longer the principal bottleneck in such systems, the focus in designing query execution engines has shifted to optimizing CPU performance. Recent systems have revived an older technique of using just-in-time (JIT) compilation to execute queries as native code instead of interpreting a plan. The state-of-the-art in query compilation is to fuse operators together in a query plan to minimize materialization overhead by passing tuples efficiently between operators. Our empirical analysis shows, however, that more tactful materialization yields better performance.
We present a query processing model called ``relaxed operator fusion'' that allows the DBMS to introduce staging points in the query plan where intermediate results are temporarily materialized. This allows the DBMS to take advantage of inter-tuple parallelism inherent in the plan using a combination of prefetching and SIMD vectorization to support faster query execution on data sets that exceed the size of CPU-level caches. Our evaluation shows that our approach reduces the execution time of OLAP queries by up to 2.2$ \times $ and achieves up to 1.8$ \times $ better performance compared to other in-memory DBMSs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2017:PSS, author = "Yu Liu and Bolong Zheng and Xiaodong He and Zhewei Wei and Xiaokui Xiao and Kai Zheng and Jiaheng Lu", title = "{ProbeSim}: scalable single-source and top-$k$ {SimRank} computations on dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "14--26", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Single-source and top-$k$ SimRank queries are two important types of similarity search in graphs with numerous applications in web mining, social network analysis, spam detection, etc. A plethora of techniques have been proposed for these two types of queries, but very few can efficiently support similarity search over large dynamic graphs, due to either significant preprocessing time or large space overheads. This paper presents ProbeSim, an index-free algorithm for single-source and top-$k$ SimRank queries that provides a non-trivial theoretical guarantee in the absolute error of query results.
ProbeSim estimates SimRank similarities without precomputing any indexing structures, and thus can naturally support real-time SimRank queries on dynamic graphs. Besides the theoretical guarantee, ProbeSim also offers satisfying practical efficiency and effectiveness due to non-trivial optimizations. We conduct extensive experiments on a number of benchmark datasets, which demonstrate that our solutions outperform the existing methods in terms of efficiency and effectiveness. Notably, our experiments include the first empirical study that evaluates the effectiveness of SimRank algorithms on graphs with billion edges, using the idea of pooling.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guagliardo:2017:FSS, author = "Paolo Guagliardo and Leonid Libkin", title = "A formal semantics of {SQL} queries, its validation, and applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "27--39", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While formal semantics of theoretical languages underlying SQL have been provided in the past, they all made simplifying assumptions ranging from changes in the syntax to omitting bag semantics and nulls. This situation is reminiscent of what happens in the field of programming languages, where semantics of formal calculi underlying the main features of languages are abundant, but formal semantics of real languages that people use are few and far between. We consider the basic class of SQL queries --- essentially SELECT-FROM-WHERE queries with subqueries, set/bag operations, and nulls --- and define a formal semantics for it, without any departures from the real language. 
This fragment already requires decisions related to the data model and handling variable names that are normally disregarded by simplified semantics. To justify our choice of the semantics, we validate it experimentally on a large number of randomly generated queries and databases. We give two applications of the semantics. One is the first formal proof of the equivalence of basic SQL and relational algebra that extends to bag semantics and nulls. The other application looks at the three-valued logic employed by SQL, which is universally assumed to be necessary to handle nulls. We prove however that this is not so, as three-valued logic does not add expressive power: every SQL query in our fragment can be evaluated under the usual two-valued Boolean semantics of conditions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kim:2017:EHS, author = "Jinhyun Kim and Jun-Ki Min and Kyuseok Shim", title = "Efficient {Haar$^+$} synopsis construction for the maximum absolute error measure", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "40--52", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Several wavelet synopsis construction algorithms were previously proposed based on dynamic programming for unrestricted Haar wavelet synopses as well as Haar$^+$ synopses. However, they find an optimal synopsis for every incoming value in each node of a coefficient tree, even if different incoming values share an identical optimal synopsis. To alleviate the limitation, we present novel algorithms, which keep only a minimal set of the distinct optimal synopses in each node of the tree, for the error-bounded synopsis problem. 
Furthermore, we propose the methods to restrict coefficient values to be considered to compute the optimal synopses in each node. In addition, by partitioning all optimal synopses in each node into a set of groups, such that every group can be represented by a compact representation, we significantly improve the performance of the proposed algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tao:2017:ASJ, author = "Wenbo Tao and Dong Deng and Michael Stonebraker", title = "Approximate string joins with abbreviations", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "53--65", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "String joins have wide applications in data integration and cleaning. The inconsistency of data caused by data errors, term variations and missing values has led to the need for approximate string joins (ASJ). In this paper, we study ASJ with abbreviations, which are a frequent type of term variation. Although prior works have studied ASJ given a user-inputted dictionary of synonym rules, they have three common limitations. First, they suffer from low precision in the presence of abbreviations having multiple full forms. Second, their join algorithms are not scalable due to the exponential time complexity. Third, the dictionary may not exist since abbreviations are highly domain-dependent. We propose an end-to-end workflow to address these limitations. 
There are three main components in the workflow: (1) a new similarity measure taking abbreviations into account that can handle abbreviations having multiple full forms, (2) an efficient join algorithm following the filter-verification framework and (3) an unsupervised approach to learn a dictionary of abbreviation rules from input strings. We evaluate our workflow on four real-world datasets and show that our workflow outputs accurate join results, scales well as input size grows and greatly outperforms state-of-the-art approaches in both accuracy and efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2017:QDF, author = "Dat Ba Nguyen and Abdalghani Abujabal and Nam Khanh Tran and Martin Theobald and Gerhard Weikum", title = "Query-driven on-the-fly knowledge base construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "66--79", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's openly available knowledge bases, such as DBpedia, Yago, Wikidata or Freebase, capture billions of facts about the world's entities. However, even the largest among these (i) are still limited in up-to-date coverage of what happens in the real world, and (ii) miss out on many relevant predicates that precisely capture the wide variety of relationships among entities. To overcome both of these limitations, we propose a novel approach to build on-the-fly knowledge bases in a query-driven manner. Our system, called QKBfly, supports analysts and journalists as well as question answering on emerging topics, by dynamically acquiring relevant facts as timely and comprehensively as possible. 
QKBfly is based on a semantic-graph representation of sentences, by which we perform three key IE tasks, namely named-entity disambiguation, co-reference resolution and relation extraction, in a light-weight and integrated manner. In contrast to Open IE, our output is canonicalized. In contrast to traditional IE, we capture more predicates, including ternary and higher-arity ones. Our experiments demonstrate that QKBfly can build high-quality, on-the-fly knowledge bases that can readily be deployed, e.g., for the task of ad-hoc question answering.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Poppe:2017:GGB, author = "Olga Poppe and Chuan Lei and Elke A. Rundensteiner and David Maier", title = "{GRETA}: graph-based real-time event trend aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "80--92", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Streaming applications from algorithmic trading to traffic management deploy Kleene patterns to detect and aggregate arbitrarily-long event sequences, called event trends. State-of-the-art systems process such queries in two steps. Namely, they first construct all trends and then aggregate them. Due to the exponential costs of trend construction, this two-step approach suffers from both long delays and high memory costs. To overcome these limitations, we propose the Graph-based Real-time Event Trend Aggregation (GRETA) approach that dynamically computes event trend aggregation without first constructing these trends. We define the GRETA graph to compactly encode all trends.
Our GRETA runtime incrementally maintains the graph, while dynamically propagating aggregates along its edges. Based on the graph, the final aggregate is incrementally updated and instantaneously returned at the end of each query window. Our GRETA runtime represents a win-win solution, reducing both the time complexity from exponential to quadratic and the space complexity from exponential to linear in the number of events. Our experiments demonstrate that GRETA achieves up to four orders of magnitude speed-up and up to 50-fold memory reduction compared to the state-of-the-art two-step approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guo:2017:PPP, author = "Wentian Guo and Yuchen Li and Mo Sha and Kian-Lee Tan", title = "Parallel {Personalized PageRank} on dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "93--106", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Personalized PageRank (PPR) is a well-known proximity measure in graphs. To meet the need for dynamic PPR maintenance, recent works have proposed a local update scheme to support incremental computation. Nevertheless, sequential execution of the scheme is still too slow for high-speed stream processing. Therefore, we are motivated to design a parallel approach for dynamic PPR computation. First, as updates always come in batches, we devise a batch processing method to reduce synchronization cost among every single update and enable more parallelism for iterative parallel execution.
Our theoretical analysis shows that the parallel approach has the same asymptotic complexity as the sequential approach. Second, we devise novel optimization techniques to effectively reduce runtime overheads for parallel processes. Experimental evaluation shows that our parallel algorithm can achieve orders of magnitude speedups on GPUs and multi-core CPUs compared with the state-of-the-art sequential algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sha:2017:ADG, author = "Mo Sha and Yuchen Li and Bingsheng He and Kian-Lee Tan", title = "Accelerating dynamic graph analytics on {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "1", pages = "107--120", month = sep, year = "2017", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Oct 10 17:16:21 MDT 2017", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As graph analytics often involves compute-intensive operations, GPUs have been extensively used to accelerate the processing. However, in many applications such as social networks, cyber security, and fraud detection, their representative graphs evolve frequently and one has to perform a rebuild of the graph structure on GPUs to incorporate the updates. Hence, rebuilding the graphs becomes the bottleneck of processing high-speed graph streams. In this paper, we propose a GPU-based dynamic graph storage scheme to support existing graph algorithms easily. Furthermore, we propose parallel update algorithms to support efficient stream updates so that the maintained graph is immediately available for high-speed analytic processing on GPUs. 
Our extensive experiments with three streaming applications on large-scale real and synthetic datasets demonstrate the superior performance of our proposed approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Appuswamy:2017:AIS, author = "Raja Appuswamy and Angelos C. Anadiotis and Danica Porobic and Mustafa K. Iman and Anastasia Ailamaki", title = "Analyzing the impact of system architecture on the scalability of {OLTP} engines for high-contention workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "121--134", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149194", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Main-memory OLTP engines are being increasingly deployed on multicore servers that provide abundant thread-level parallelism. However, recent research has shown that even the state-of-the-art OLTP engines are unable to exploit available parallelism for high contention workloads. While previous studies have shown the lack of scalability of all popular concurrency control protocols, they consider only one system architecture---a non-partitioned, shared everything one where transactions can be scheduled to run on any core and can access any data or metadata stored in shared memory. In this paper, we perform a thorough analysis of the impact of other architectural alternatives (Data-oriented transaction execution, Partitioned Serial Execution, and Delegation) on scalability under high contention scenarios. In doing so, we present Trireme, a main-memory OLTP engine testbed that implements four system architectures and several popular concurrency control protocols in a single code base. 
Using Trireme, we present an extensive experimental study to understand (i) the impact of each system architecture on overall scalability, (ii) the interaction between system architecture and concurrency control protocols, and (iii) the pros and cons of new architectures that have been proposed recently to explicitly deal with high-contention workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jung:2017:SDL, author = "Hyungsoo Jung and Hyuck Han and Sooyong Kang", title = "Scalable database logging for multicores", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "135--148", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149195", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern databases, guaranteeing atomicity and durability, store transaction logs in a volatile, central log buffer and then flush the log buffer to non-volatile storage by the write-ahead logging principle. Buffering logs in central log store has recently faced a severe multicore scalability problem, and log flushing has been challenged by synchronous I/O delay. We have designed and implemented a fast and scalable logging method, Eleda, that can migrate a surge of transaction logs from volatile memory to stable storage without risking durable transaction atomicity. Our efficient implementation of Eleda is enabled by a highly concurrent data structure, Grasshopper, that eliminates a multicore scalability problem of centralized logging and enhances system utilization in the presence of synchronous I/O delay. We implemented Eleda and plugged it to WiredTiger and Shore-MT by replacing their log managers.
Our evaluation showed that Eleda-based transaction systems improve performance up to $ 71 \times $, thus showing the applicability of Eleda.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bonifati:2017:ASL, author = "Angela Bonifati and Wim Martens and Thomas Timm", title = "An analytical study of large {SPARQL} query logs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "149--161", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149196", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the adoption of RDF as the data model for Linked Data and the Semantic Web, query specification from end-users has become more and more common in SPARQL endpoints. In this paper, we conduct an in-depth analytical study of the queries formulated by end-users and harvested from large and up-to-date query logs from a wide variety of RDF data sources. As opposed to previous studies, ours is the first assessment on a voluminous query corpus, spanning over several years and covering many representative SPARQL endpoints. Apart from the syntactical structure of the queries, that exhibits already interesting results on this generalized corpus, we drill deeper in the structural characteristics related to the graph and hypergraph representation of queries. We outline the most common shapes of queries when visually displayed as undirected graphs, and characterize their (hyper-)tree width. Moreover, we analyze the evolution of queries over time, by introducing the novel concept of a streak, i.e., a sequence of queries that appear as subsequent modifications of a seed query. 
Our study offers several fresh insights on the already rich query features of real SPARQL queries formulated by real users, and brings us to draw a number of conclusions and pinpoint future directions for SPARQL query evaluation, query optimization, tuning, and benchmarking.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:ACT, author = "Pinghui Wang and Yiyan Qi and Yu Sun and Xiangliang Zhang and Jing Tao and Xiaohong Guan", title = "Approximately counting triangles in large graph streams including edge duplicates with a fixed memory usage", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "162--175", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149197", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Counting triangles in a large graph is important for detecting network anomalies such as spam web pages and suspicious accounts (e.g., fraudsters and advertisers) on online social networks. However, it is challenging to compute the number of triangles in a large graph represented as a stream of edges with a low computational cost when given a limited memory. Recently, several effective sampling-based approximation methods have been developed to solve this problem. However, they assume the graph stream of interest contains no duplicate edges, which does not hold in many real-world graph streams (e.g., phone calling networks). In this paper, we observe that these methods exhibit a large estimation error or computational cost even when modified to deal with duplicate edges using deduplication techniques such as Bloom filter and hash-based sampling. 
To solve this challenge, we design a one-pass streaming algorithm for uniformly sampling distinct edges at a high speed. Compared to state-of-the-art algorithms, our algorithm reduces the sampling cost per edge from $ O(\log k) $ ($k$ is the maximum number of sampled edges determined by the available memory space) to $ O(1) $ without using any additional memory space. Based on sampled edges, we develop a simple yet accurate method to infer the number of triangles in the original graph stream. We conduct extensive experiments on a variety of real-world large graphs, and the results demonstrate that our method is several times more accurate and faster than state-of-the-art methods with the same memory usage.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiao:2017:SMC, author = "Miao Qiao and Hao Zhang and Hong Cheng", title = "Subgraph matching: on compression and computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "176--188", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149198", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph matching finds a set I of all occurrences of a pattern graph in a target graph. It has a wide range of applications while suffers an expensive computation. This efficiency issue has been studied extensively. All existing approaches, however, turn a blind eye to the output crisis, that is, when the system has to materialize I as a preprocessing/intermediate/final result or an index, the cost of the export of I dominates the overall cost, which could be prohibitive even for a small pattern graph. This paper studies subgraph matching via two problems.
(1) Is there an ideal compression of I? (2) Will the compression of I reversely boost the computation of I? For the problem (1), we propose a technique called VCBC to compress I to code(I) which serves effectively the same as I. For problem (2), we propose a subgraph matching computation framework CBF which computes code(I) instead of I to bring down the output cost. CBF further reduces the overall cost by reducing the intermediate results. Extensive experiments show that the compression ratio of VCBC can be up to $ 10^5 $ which also significantly lowers the output cost of CBF. Extensive experiments show the superior performance of CBF over existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Singh:2017:SEM, author = "Rohit Singh and Venkata Vamsikrishna Meduri and Ahmed Elmagarmid and Samuel Madden and Paolo Papotti and Jorge-Arnulfo Quian{\'e}-Ruiz and Armando Solar-Lezama and Nan Tang", title = "Synthesizing entity matching rules by examples", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "189--202", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149199", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity matching (EM) is a critical part of data integration. We study how to synthesize entity matching rules from positive-negative matching examples. The core of our solution is program synthesis, a powerful tool to automatically generate rules (or programs) that satisfy a given high-level specification, via a predefined grammar. 
This grammar describes a General Boolean Formula (GBF) that can include arbitrary attribute matching predicates combined by conjunctions ($ \wedge $), disjunctions ($ \vee $) and negations ($ \neg $), and is expressive enough to model EM problems, from capturing arbitrary attribute combinations to handling missing attribute values. The rules in the form of GBF are more concise than traditional EM rules represented in Disjunctive Normal Form (DNF). Consequently, they are more interpretable than decision trees and other machine learning algorithms that output deep trees with many branches. We present a new synthesis algorithm that, given only positive-negative examples as input, synthesizes EM rules that are effective over the entire dataset. Extensive experiments show that we outperform other interpretable rules (e.g., decision trees with low depth) in effectiveness, and are comparable with non-interpretable tools (e.g., decision trees with high depth, gradient-boosting trees, random forests and SVM).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2017:SST, author = "Liang He and Bin Shao and Yatao Li and Huanhuan Xia and Yanghua Xiao and Enhong Chen and Liang Jeff Chen", title = "{Stylus}: a strongly-typed store for serving massive {RDF} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "203--216", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149200", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RDF is one of the most commonly used knowledge representation forms. Many highly influential knowledge bases, such as Freebase and PubChemRDF, are in RDF format.
An RDF data set is usually represented as a collection of subject-predicate-object triples. Despite the flexibility of RDF triples, it is challenging to serve SPARQL queries on RDF data efficiently by directly managing triples due to the following two reasons. First, heavy joins on a large number of triples are needed for query processing, resulting in a large number of data scans and large redundant intermediate results; Second, weakly-typed triple representation provides suboptimal random access --- typically with logarithmic complexity. This data access challenge, unfortunately, cannot be easily met by a better query optimizer as large graph processing is extremely I/O-intensive. In this paper, we argue that strongly-typed graph representation is the key to high-performance RDF query processing. We propose Stylus --- a strongly-typed store for serving massive RDF data. Stylus exploits a strongly-typed storage scheme to boost the performance of RDF query processing. The storage scheme is essentially a materialized join view on entities, it thus can eliminate a large number of unnecessary joins on triples. Moreover, it is equipped with a compact representation for intermediate results and an efficient graph-decomposition based query planner. Experimental results on both synthetic and real-life RDF data sets confirm that the proposed approach can dramatically boost the performance of SPARQL query processing.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ioannou:2017:HQE, author = "Ekaterini Ioannou and Minos Garofalakis", title = "Holistic query evaluation over information extraction pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "217--229", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149201", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce holistic in-database query processing over information extraction pipelines. This requires considering the joint conditional distribution over generic Conditional Random Fields that uses factor graphs to encode extraction tasks. Our approach introduces Canopy Factor Graphs, a novel probabilistic model for effectively capturing the joint conditional distribution given a canopy clustering of the data, and special query operators for retrieving resolution information. Since inference on such models is intractable, we introduce an approximate technique for query processing and optimizations that cut across the integrated tasks for reducing the required processing time. Effectiveness and scalability are verified through an extensive experimental evaluation using real and synthetic data.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Psaropoulos:2017:ICP, author = "Georgios Psaropoulos and Thomas Legler and Norman May and Anastasia Ailamaki", title = "Interleaving with coroutines: a practical approach for robust index joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "2", pages = "230--242", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3149193.3149202", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 30 06:16:03 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Index join performance is determined by the efficiency of the lookup operation on the involved index. Although database indexes are highly optimized to leverage processor caches, main memory accesses inevitably increase lookup runtime when the index outsizes the last-level cache; hence, index join performance drops. Still, robust index join performance becomes possible with instruction stream interleaving: given a group of lookups, we can hide cache misses in one lookup with instructions from other lookups by switching among their respective instruction streams upon a cache miss. In this paper, we propose interleaving with coroutines for any type of index join. We showcase our proposal on SAP HANA by implementing binary search and CSB$^+$-tree traversal for an instance of index join related to dictionary compression. Coroutine implementations not only perform similarly to prior interleaving techniques, but also resemble the original code closely, while supporting both interleaved and non-interleaved execution. Thus, we claim that coroutines make interleaving practical for use in real DBMS codebases.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wen:2017:ESG, author = "Dong Wen and Lu Qin and Ying Zhang and Lijun Chang and Xuemin Lin", title = "Efficient structural graph clustering: an index-based approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "243--255", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157795", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph clustering is a fundamental problem widely experienced across many industries. The structural graph clustering (SCAN) method obtains not only clusters but also hubs and outliers. However, the clustering results closely depend on two sensitive parameters, $ \epsilon $ and $ \mu $, while the optimal parameter setting depends on different graph properties and various user requirements. Moreover, all existing SCAN solutions need to scan at least the whole graph, even if only a small number of vertices belong to clusters. In this paper we propose an index-based method for SCAN. Based on our index, we cluster the graph for any $ \epsilon $ and $ \mu $ in $ O(\sum_{C \in \mathcal{C}} |E_C|) $ time, where $ \mathcal{C} $ is the result set of all clusters and $ |E_C| $ is the number of edges in a specific cluster $C$. In other words, the time expended to compute structural clustering depends only on the result size, not on the size of the original graph. Our index's space complexity is bounded by $ O(m)$, where $m$ is the number of edges in the graph. To handle dynamic graph updates, we propose algorithms and several optimization techniques for maintaining our index. We conduct extensive experiments to practically evaluate the performance of all our proposed algorithms on 10 real-world networks, one of which contains more than 1 billion edges.
The experimental results demonstrate that our approaches significantly outperform existing solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{DeCapitanidiVimercati:2017:AMM, author = "Sabrina {De Capitani di Vimercati} and Sara Foresti and Sushil Jajodia and Giovanni Livraga and Stefano Paraboschi and Pierangela Samarati", title = "An authorization model for multi provider queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "256--268", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157796", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a novel approach for the specification and enforcement of authorizations that enables controlled data sharing for collaborative queries in the cloud. Data authorities can establish authorizations regulating access to their data distinguishing three visibility levels (no visibility, encrypted visibility, and plaintext visibility). Authorizations are enforced in the query execution by possibly restricting operation assignments to other parties and by adjusting visibility of data on-the-fly. Our approach enables users and data authorities to fully enjoy the benefits and economic savings of the competitive open cloud market, while maintaining control over data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ratner:2017:SRT, author = "Alexander Ratner and Stephen H. 
Bach and Henry Ehrenberg and Jason Fries and Sen Wu and Christopher R{\'e}", title = "{Snorkel}: rapid training data creation with weak supervision", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "269--282", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157797", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Labeling training data is increasingly the largest bottleneck in deploying machine learning systems. We present Snorkel, a first-of-its-kind system that enables users to train state-of-the-art models without hand labeling any training data. Instead, users write labeling functions that express arbitrary heuristics, which can have unknown accuracies and correlations. Snorkel denoises their outputs without access to ground truth by incorporating the first end-to-end implementation of our recently proposed machine learning paradigm, data programming. We present a flexible interface layer for writing labeling functions based on our experience over the past year collaborating with companies, agencies, and research labs. In a user study, subject matter experts build models $ 2.8 \times $ faster and increase predictive performance an average 45.5\% versus seven hours of hand labeling. We study the modeling tradeoffs in this new setting and propose an optimizer for automating tradeoff decisions that gives up to $ 1.8 \times $ speedup per pipeline execution. In two collaborations, with the U.S. Department of Veterans Affairs and the U.S. Food and Drug Administration, and on four open-source text and image data sets representative of other deployments, Snorkel provides 132\% average improvements to predictive performance over prior heuristic approaches and comes within an average 3.60\% of the predictive performance of large hand-curated training sets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:VPV, author = "Yuliang Li and Alin Deutsch and Victor Vianu", title = "{VERIFAS}: a practical verifier for artifact systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "283--296", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157798", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-driven workflows, of which IBM's Business Artifacts are a prime exponent, have been successfully deployed in practice, adopted in industrial standards, and have spawned a rich body of research in academia, focused primarily on static analysis. The present research bridges the gap between the theory and practice of artifact verification with VERIFAS, the first implementation of practical significance of an artifact verifier with full support for unbounded data. VERIFAS verifies within seconds linear-time temporal properties over real-world and synthetic workflows of complexity in the range recommended by software engineering practice. Compared to our previous implementation based on the widely-used Spin model checker, VERIFAS not only supports a model with richer data manipulations but also outperforms it by over an order of magnitude. VERIFAS' good performance is due to a novel symbolic representation approach and a family of specialized optimizations.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jia:2017:DMG, author = "Zhihao Jia and Yongkee Kwon and Galen Shipman and Pat McCormick and Mattan Erez and Alex Aiken", title = "A distributed multi-{GPU} system for fast graph processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "297--310", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157799", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Lux, a distributed multi-GPU system that achieves fast graph processing by exploiting the aggregate memory bandwidth of multiple GPUs and taking advantage of locality in the memory hierarchy of multi-GPU clusters. Lux provides two execution models that optimize algorithmic efficiency and enable important GPU optimizations, respectively. Lux also uses a novel dynamic load balancing strategy that is cheap and achieves good load balance across GPUs. In addition, we present a performance model that quantitatively predicts the execution times and automatically selects the runtime configurations for Lux applications. Experiments show that Lux achieves up to 20X speedup over state-of-the-art shared memory systems and up to two orders of magnitude speedup over distributed systems.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bleifuss:2017:EDC, author = "Tobias Bleifu{\ss} and Sebastian Kruse and Felix Naumann", title = "Efficient denial constraint discovery with {Hydra}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "311--323", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157800", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Denial constraints (DCs) are a generalization of many other integrity constraints (ICs) widely used in databases, such as key constraints, functional dependencies, or order dependencies. Therefore, they can serve as a unified reasoning framework for all of these ICs and express business rules that cannot be expressed by the more restrictive IC types. The process of formulating DCs by hand is difficult, because it requires not only domain expertise but also database knowledge, and due to DCs' inherent complexity, this process is tedious and error-prone. Hence, an automatic DC discovery is highly desirable: we search for all valid denial constraints in a given database instance. However, due to the large search space, the problem of DC discovery is computationally expensive. We propose a new algorithm Hydra, which overcomes the quadratic runtime complexity in the number of tuples of state-of-the-art DC discovery methods. The new algorithm's experimentally determined runtime grows only linearly in the number of tuples. This results in a speedup by orders of magnitude, especially for datasets with a large number of tuples. Hydra can deliver results in a matter of seconds that to date took hours to compute.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Azim:2017:RRC, author = "Tahir Azim and Manos Karpathiotakis and Anastasia Ailamaki", title = "{ReCache}: reactive caching for fast analytics over heterogeneous data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "324--337", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data continues to be generated at exponentially growing rates in heterogeneous formats, fast analytics to extract meaningful information is becoming increasingly important. Systems widely use in-memory caching as one of their primary techniques to speed up data analytics. However, caches in data analytics systems cannot rely on simple caching policies and a fixed data layout to achieve good performance. Different datasets and workloads require different layouts and policies to achieve optimal performance. This paper presents ReCache, a cache-based performance accelerator that is reactive to the cost and heterogeneity of diverse raw data formats. Using timing measurements of caching operations and selection operators in a query plan, ReCache accounts for the widely varying costs of reading, parsing, and caching data in nested and tabular formats. Combining these measurements with information about frequently accessed data fields in the workload, ReCache automatically decides whether a nested or relational column-oriented layout would lead to better query performance. Furthermore, ReCache keeps track of commonly utilized operators to make informed cache admission and eviction decisions. 
Experiments on synthetic and real-world datasets show that our caching techniques decrease caching overhead for individual queries by an average of 59\%. Furthermore, over the entire workload, ReCache reduces execution time by 19--75\% compared to existing techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yuan:2017:EED, author = "Long Yuan and Lu Qin and Xuemin Lin and Lijun Chang and Wenjie Zhang", title = "Effective and efficient dynamic graph coloring", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "338--351", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph coloring is a fundamental graph problem that is widely applied in a variety of applications. The aim of graph coloring is to minimize the number of colors used to color the vertices in a graph such that no two incident vertices have the same color. Existing solutions for graph coloring mainly focus on computing a good coloring for a static graph. However, since many real-world graphs are highly dynamic, in this paper, we aim to incrementally maintain the graph coloring when the graph is dynamically updated. We target on two goals: high effectiveness and high efficiency. To achieve high effectiveness, we maintain the graph coloring in a way such that the coloring result is consistent with one of the best static graph coloring algorithms for large graphs. To achieve high efficiency, we investigate efficient incremental algorithms to update the graph coloring by exploring a small number of vertices. 
We design a color-propagation based algorithm which only explores the vertices within the 2-hop neighbors of the update-related and color-changed vertices. We then propose a novel color index to maintain some summary color information and, thus, bound the explored vertices within the neighbors of these vertices. Moreover, we derive some effective pruning rules to further reduce the number of propagated vertices. The experimental results demonstrate the high effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zacharatou:2017:GRR, author = "Eleni Tzirita Zacharatou and Harish Doraiswamy and Anastasia Ailamaki and Cl{\'a}udio T. Silva and Juliana Freire", title = "{GPU} rasterization for real-time spatial aggregation over arbitrary polygons", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "352--365", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visual exploration of spatial data relies heavily on spatial aggregation queries that slice and summarize the data over different regions. These queries comprise computationally-intensive point-in-polygon tests that associate data points to polygonal regions, challenging the responsiveness of visualization tools. This challenge is compounded by the sheer amounts of data, requiring a large number of such tests to be performed. Traditional pre-aggregation approaches are unsuitable in this setting since they fix the query constraints and support only rectangular regions. On the other hand, query constraints are defined interactively in visual analytics systems, and polygons can be of arbitrary shapes. 
In this paper, we convert a spatial aggregation query into a set of drawing operations on a canvas and leverage the rendering pipeline of the graphics hardware (GPU) to enable interactive response times. Our technique trades-off accuracy for response time by adjusting the canvas resolution, and can even provide accurate results when combined with a polygon index. We evaluate our technique on two large real-world data sets, exhibiting superior performance compared to index-based approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shah:2017:KFK, author = "Vraj Shah and Arun Kumar and Xiaojin Zhu", title = "Are key--foreign key joins safe to avoid when learning high-capacity classifiers?", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "366--379", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning (ML) over relational data is a booming area of data management. While there is a lot of work on scalable and fast ML systems, little work has addressed the pains of sourcing data for ML tasks. Real-world relational databases typically have many tables (often, dozens) and data scientists often struggle to even obtain all tables for joins before ML. In this context, Kumar et al. showed recently that key-foreign key dependencies (KFKDs) between tables often lets us avoid such joins without significantly affecting prediction accuracy --- an idea they called ``avoiding joins safely.'' While initially controversial, this idea has since been used by multiple companies to reduce the burden of data sourcing for ML. But their work applied only to linear classifiers. 
In this work, we verify if their results hold for three popular high-capacity classifiers: decision trees, non-linear SVMs, and ANNs. We conduct an extensive experimental study using both real-world datasets and simulations to analyze the effects of avoiding KFK joins on such models. Our results show that these high-capacity classifiers are surprisingly and counter-intuitively more robust to avoiding KFK joins compared to linear classifiers, refuting an intuition from the prior work's analysis. We explain this behavior intuitively and identify open questions at the intersection of data management and ML theoretical research. All of our code and datasets are available for download from \path|http://cseweb.ucsd.edu/~arunkk/hamlet|.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2017:WRC, author = "Zheng Liu and Lei Chen", title = "Worker recommendation for crowdsourced {Q\&A} services: a triple-factor aware approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "3", pages = "380--392", month = nov, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3157794.3157805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Dec 11 16:07:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Worker Recommendation (WR) is one of the most important functions for crowdsourced Q\&A services. Specifically, given a set of tasks to be solved, WR recommends each task with a certain group of workers, whom are expected to give timely answers with high qualities. To address the WR problem, recent studies have introduced a number of recommendation approaches, which take advantage of workers' expertises or preferences towards different types of tasks. 
However, without a thorough consideration of workers' characters, such approaches will lead to either inadequate task fulfillment or inferior answer quality. In this work, we propose the Triple-factor Aware Worker Recommendation framework, which collectively considers workers' expertises, preferences and activenesses to maximize the overall production of high quality answers. We construct the Latent Hierarchical Factorization Model, which is able to infer the tasks' underlying categories and workers' latent characters from the historical data; and we propose a novel parameter inference method, which only requires the processing of positive instances, giving rise to significantly higher time efficiency and better inference quality. What's more, the sampling-based recommendation algorithm is developed, such that the near optimal worker recommendation can be generated for a presented batch of tasks with considerably reduced time consumption. Comprehensive experiments have been carried out using both real and synthetic datasets, whose results verify the effectiveness and efficiency of our proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gong:2017:CSD, author = "Shufeng Gong and Yanfeng Zhang and Ge Yu", title = "Clustering stream data by exploring the evolution of density mountain", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "393--405", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164136", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream clustering is a fundamental problem in many streaming data analysis applications. 
Comparing to classical batch-mode clustering, there are two key challenges in stream clustering: (i) Given that input data are changing continuously, how to incrementally update their clustering results efficiently? (ii) Given that clusters continuously evolve with the evolution of data, how to capture the cluster evolution activities? Unfortunately, most of existing stream clustering algorithms can neither update the cluster result in real-time nor track the evolution of clusters. In this paper, we propose a stream clustering algorithm EDMStream by exploring the Evolution of Density Mountain. The density mountain is used to abstract the data distribution, the changes of which indicate data distribution evolution. We track the evolution of clusters by monitoring the changes of density mountains. We further provide efficient data structures and filtering schemes to ensure that the update of density mountains is in real-time, which makes online clustering possible. The experimental results on synthetic and real datasets show that, comparing to the state-of-the-art stream clustering algorithms, e.g., D-Stream, DenStream, DBSTREAM and MR-Stream, our algorithm is able to response to a cluster update much faster (say 7-15x faster than the best of the competitors) and at the same time achieve comparable cluster quality. Furthermore, EDMStream successfully captures the cluster evolution activities.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2017:QFL, author = "Tianzheng Wang and Ryan Johnson and Ippokratis Pandis", title = "Query fresh: log shipping on steroids", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "406--419", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164137", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hot standby systems often have to trade safety (i.e., not losing committed work) and freshness (i.e., having access to recent updates) for performance. Guaranteeing safety requires synchronous log shipping that blocks the primary until the log records are durably replicated in one or multiple backups; maintaining freshness necessitates fast log replay on backups, but is often defeated by the dual-copy architecture and serial replay: a backup must generate the ``real'' data from the log to make recent updates accessible to read-only queries. This paper proposes Query Fresh, a hot standby system that provides both safety and freshness while maintaining high performance on the primary. The crux is an append-only storage architecture used in conjunction with fast networks (e.g., InfiniBand) and byte-addressable, non-volatile memory (NVRAM). Query Fresh avoids the dual-copy design and treats the log as the database, enabling lightweight, parallel log replay that does not block the primary. Experimental results using the TPC-C benchmark show that under Query Fresh, backup servers can replay log records faster than they are generated by the primary server, using one quarter of the available compute resources. 
With a 56Gbps network, Query Fresh can support up to 4--5 synchronous replicas, each of which receives and replays $ \approx $1.4GB of log records per second, with up to 4--6\% overhead on the primary compared to a standalone server that achieves 620kTPS without replication.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sahu:2017:ULG, author = "Siddhartha Sahu and Amine Mhedhbi and Semih Salihoglu and Jimmy Lin and M. Tamer {\"O}zsu", title = "The ubiquity of large graphs and surprising challenges of graph processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "420--431", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164139", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph processing is becoming increasingly prevalent across many application domains. In spite of this prevalence, there is little research about how graphs are actually used in practice. We conducted an online survey aimed at understanding: (i) the types of graphs users have; (ii) the graph computations users run; (iii) the types of graph software users use; and (iv) the major challenges users face when processing their graphs. We describe the participants' responses to our questions highlighting common patterns and challenges. We further reviewed user feedback in the mailing lists, bug reports, and feature requests in the source repositories of a large suite of software products for processing graphs. Through our review, we were able to answer some new questions that were raised by participants' responses and identify specific challenges that users face when using different classes of graph software. 
The participants' responses and data we obtained revealed surprising facts about graph processing in practice. In particular, real-world graphs represent a very diverse range of entities and are often very large, and scalability and visualization are undeniably the most pressing challenges faced by participants. We hope these findings can guide future research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ramachandra:2017:FOI, author = "Karthik Ramachandra and Kwanghyun Park and K. Venkatesh Emani and Alan Halverson and C{\'e}sar Galindo-Legaria and Conor Cunningham", title = "{Froid}: optimization of imperative programs in a relational database", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "432--444", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164140", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For decades, RDBMSs have supported declarative SQL as well as imperative functions and procedures as ways for users to express data processing tasks. While the evaluation of declarative SQL has received a lot of attention resulting in highly sophisticated techniques, the evaluation of imperative programs has remained na{\"\i}ve and highly inefficient. Imperative programs offer several benefits over SQL and hence are often preferred and widely used. But unfortunately, their abysmal performance discourages, and even prohibits their use in many situations. We address this important problem that has hitherto received little attention. We present Froid, an extensible framework for optimizing imperative programs in relational databases. 
Froid's novel approach automatically transforms entire User Defined Functions (UDFs) into relational algebraic expressions, and embeds them into the calling SQL query. This form is now amenable to cost-based optimization and results in efficient, set-oriented, parallel plans as opposed to inefficient, iterative, serial execution of UDFs. Froid's approach additionally brings the benefits of many compiler optimizations to UDFs with no additional implementation effort. We describe the design of Froid and present our experimental evaluation that demonstrates performance improvements of up to multiple orders of magnitude on real workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2017:ESH, author = "Ye Li and Leong Hou U and Man Lung Yiu and Ngai Meng Kou", title = "An experimental study on hub labeling based shortest path algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "445--457", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest path distance retrieval is a core component in many important applications. For a decade, hub labeling (HL) techniques have been considered as a practical solution with fast query response time (e.g., 1--3 orders of magnitude faster), competitive indexing time, and slightly larger storage overhead (e.g., several times larger). These techniques enhance query throughput up to hundred thousands queries per second, which is particularly helpful in large user environment. Despite the importance of HL techniques, we are not aware of any comprehensive experimental study on HL techniques. 
Thus it is difficult for a practitioner to adopt HL techniques for her applications. To address the above issues, we provide a comprehensive experimental study on the state-of-the-art HL technique with analysis of their efficiency, effectiveness and applicability. From insightful summary of different HL techniques, we further develop a simple yet effective HL techniques called Significant path based Hub Pushing (SHP) which greatly improves indexing time of previous techniques while retains good query performance. We also complement extensive comparisons between HL techniques and other shortest path solutions to demonstrate robustness and efficiency of HL techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Merritt:2017:CLS, author = "Alexander Merritt and Ada Gavrilovska and Yuan Chen and Dejan Milojicic", title = "Concurrent log-structured memory for many-core key--value stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "458--471", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Key-value stores are an important tool in managing and accessing large in-memory data sets. As many applications benefit from having as much of their working state fit into main memory, an important design of the memory management of modern key--value stores is the use of log-structured approaches, enabling efficient use of the memory capacity, by compacting objects to avoid fragmented states. 
However, with the emergence of thousand-core and peta-byte memory platforms (DRAM or future storage-class memories) log-structured designs struggle to scale, preventing parallel applications from exploiting the full capabilities of the hardware: careful coordination is required for background activities (compacting and organizing memory) to remain asynchronous with respect to the use of the interface, and for insertion operations to avoid contending for centralized resources such as the log head and memory pools. In this work, we present the design of a log-structured key--value store called Nibble that incorporates a multi-head log for supporting concurrent writes, a novel distributed epoch mechanism for scalable memory reclamation, and an optimistic concurrency index. We implement Nibble in the Rust language in ca. 4000 lines of code, and evaluate it across a variety of data-serving workloads on a 240-core cache-coherent server. Our measurements show Nibble scales linearly in uniform YCSB workloads, matching competitive non-log-structured key--value stores for write-dominated traces at 50 million operations per second on 1 TiB-sized working sets. Our memory analysis shows Nibble is efficient, requiring less than 10\% additional capacity, whereas memory use by non-log-structured key--value store designs may be as high as 2x.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ceccarello:2017:CUG, author = "Matteo Ceccarello and Carlo Fantozzi and Andrea Pietracaprina and Geppino Pucci and Fabio Vandin", title = "Clustering uncertain graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "472--484", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164143", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An uncertain graph $ G = (V, E, p : E \to (0, 1]) $ can be viewed as a probability space whose outcomes (referred to as possible worlds) are subgraphs of $G$ where any edge $ e \in E $ occurs with probability $ p(e)$, independently of the other edges. These graphs naturally arise in many application domains where data management systems are required to cope with uncertainty in interrelated data, such as computational biology, social network analysis, network reliability, and privacy enforcement, among the others. For this reason, it is important to devise fundamental querying and mining primitives for uncertain graphs. This paper contributes to this endeavor with the development of novel strategies for clustering uncertain graphs. Specifically, given an uncertain graph $G$ and an integer $k$, we aim at partitioning its nodes into $k$ clusters, each featuring a distinguished center node, so to maximize the minimum/average connection probability of any node to its cluster's center, in a random possible world. We assess the NP-hardness of maximizing the minimum connection probability, even in the presence of an oracle for the connection probabilities, and develop efficient approximation algorithms for both problems and some useful variants. 
Unlike previous works in the literature, our algorithms feature provable approximation guarantees and are capable to keep the granularity of the returned clustering under control. Our theoretical findings are complemented with several experiments that compare our algorithms against some relevant competitors, with respect to both running-time and quality of the returned clusterings.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abdelaziz:2017:LSQ, author = "Ibrahim Abdelaziz and Essam Mansour and Mourad Ouzzani and Ashraf Aboulnaga and Panos Kalnis", title = "{Lusail}: a system for querying linked data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "485--498", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164144", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The RDF data model allows publishing interlinked RDF datasets, where each dataset is independently maintained and is queryable via a SPARQL endpoint. Many applications would benefit from querying the resulting large, decentralized, geo-distributed graph through a federated SPARQL query processor. A crucial factor for good performance in federated query processing is pushing as much computation as possible to the local endpoints. Surprisingly, existing federated SPARQL engines are not effective at this task since they rely only on schema information. Consequently, they cause unnecessary data retrieval and communication, leading to poor scalability and response time. This paper addresses these limitations and presents Lusail, a scalable and efficient federated SPARQL system for querying large RDF graphs that are geo-distributed on different endpoints. 
Lusail uses a novel query rewriting algorithm to push computation to the local endpoints by relying on information about the RDF instances and not only the schema. The query rewriting algorithm has the additional advantage of exposing parallelism in query processing, which Lusail exploits through advanced scheduling at query run time. Our experiments on billions of triples of real and synthetic data show that Lusail outperforms state-of-the-art systems by orders of magnitude in terms of scalability and response time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Harmouch:2017:CEE, author = "Hazar Harmouch and Felix Naumann", title = "Cardinality estimation: an experimental survey", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "499--512", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164145", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data preparation and data profiling comprise many both basic and complex tasks to analyze a dataset at hand and extract metadata, such as data distributions, key candidates, and functional dependencies. Among the most important types of metadata is the number of distinct values in a column, also known as the zeroth-frequency moment. Cardinality estimation itself has been an active research topic in the past decades due to its many applications. The aim of this paper is to review the literature of cardinality estimation and to present a detailed experimental study of twelve algorithms, scaling far beyond the original experiments. First, we outline and classify approaches to solve the problem of cardinality estimation --- we describe their main idea, error-guarantees, advantages, and disadvantages. 
Our experimental survey then compares the performance of all twelve cardinality estimation algorithms. We evaluate the algorithms' accuracy, runtime, and memory consumption using synthetic and real-world datasets. Our results show that different algorithms excel in different categories, and we highlight their trade-offs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2017:SSL, author = "Jong-Hyeok Park and Gihwan Oh and Sang-Won Lee", title = "{SQL} statement logging for making {SQLite} truly lite", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "4", pages = "513--525", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164146", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Feb 15 16:29:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The lightweight codebase of SQLite was helpful in making it become the de-facto standard database in most mobile devices, but, at the same time, forced it to take less-complicated transactional schemes, such as physical page logging, journaling, and force commit, which in turn cause excessive write amplification. Thus, the write IO cost in SQLite is not lightweight at all. In this paper, to make SQLite truly lite in terms of IO efficiency for the transactional support, we propose SQLite/SSL, a per-transaction SQL statement logging scheme: when a transaction commits, SQLite/SSL ensures its durability by storing only SQL statements of small size, thus writing less and performing faster at no compromise of transactional solidity. Our main contribution is to show that, based on the observation that mobile transactions tend to be short and exhibit strong update locality, logical logging can, though long discarded, become an elegant and perfect fit for SQLite-based mobile applications. 
Further, we leverage the WAL journal mode in vanilla SQLite as a transaction-consistent checkpoint mechanism which is indispensable in any logical logging scheme. In addition, we show for the first time that byte-addressable NVM (non-volatile memory) in host-side can realize the full potential of logical logging because it allows to store fine-grained logs quickly. We have prototyped SQLite/SSL by augmenting vanilla SQLite with a transaction-consistent checkpoint mechanism and a redo-only recovery logic, and have evaluated its performance using a set of synthetic and real workloads. When a real NVM board is used as its log device, SQLite/SSL can outperform vanilla SQLite's WAL mode by up to 300x and also outperform the state-of-the-arts SQLite/PPL scheme by several folds in terms of IO time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", remark = "The speedups come from employing nonvolatile memory (which costs about 10 times as much as DRAM) for database updates, and delaying writes to the filesystem, which is important for SSD devices that have limited write life. The target platform is mobile devices. There is no mention of whether the extensions to the public-domain SQLite3 code are available to others.", } @Article{Johnson:2018:TPD, author = "Noah Johnson and Joseph P. 
Near and Dawn Song", title = "Towards practical differential privacy for {SQL} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "526--539", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177733", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy promises to enable general data analytics while protecting individual privacy, but existing differential privacy mechanisms do not support the wide variety of features and databases used in real-world SQL-based analytics systems. This paper presents the first practical approach for differential privacy of SQL queries. Using 8.1 million real-world queries, we conduct an empirical study to determine the requirements for practical differential privacy, and discuss limitations of previous approaches in light of these requirements. To meet these requirements we propose elastic sensitivity, a novel method for approximating the local sensitivity of queries with general equijoins. We prove that elastic sensitivity is an upper bound on local sensitivity and can therefore be used to enforce differential privacy using any local sensitivity-based mechanism. We build FLEX, a practical end-to-end system to enforce differential privacy for SQL queries using elastic sensitivity. We demonstrate that FLEX is compatible with any existing database, can enforce differential privacy for real-world SQL queries, and incurs negligible (0.03\%) performance overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shraer:2018:CSS, author = "Alexander Shraer and Alexandre Aybes and Bryan Davis and Christos Chrysafis and Dave Browning and Eric Krugler and Eric Stone and Harrison Chandler and Jacob Farkas and John Quinn and Jonathan Ruben and Michael Ford and Mike McMahon and Nathan Williams and Nicolas Favre-Felix and Nihar Sharma and Ori Herrnstadt and Paul Seligman and Raghav Pisolkar and Scott Dugas and Scott Gray and Sytze Harkema and Valentin Kravtsov and Vanessa Hong and Wan Ling Yih and Yizuo Tian", title = "{CloudKit}: structured storage for mobile applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "540--552", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164138", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "CloudKit is Apple's cloud backend service and application development framework that provides strongly-consistent storage for structured data and makes it easy to synchronize data across user devices or share it among multiple users. Launched more than 3 years ago, CloudKit forms the foundation for more than 50 Apple apps, including many of our most important and popular applications such as Photos, iCloud Drive, Notes, Keynote, and News, as well as many third-party apps. To deliver this at large scale, CloudKit explicitly leverages multi-tenancy at the application level as well as at the user level to guide efficient data placement and distribution. By using CloudKit application developers are free to focus on delivering the application front-end and logic while relying on CloudKit for scale, consistency, durability and security. 
CloudKit manages petabytes of data and handles hundreds of millions of users around the world on a daily basis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arulraj:2018:BHP, author = "Joy Arulraj and Justin Levandoski and Umar Farooq Minhas and Per-{\AA}ke Larson", title = "{BzTree}: a high-performance latch-free range index for non-volatile memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "553--565", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3164135.3164147", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Storing a database (rows and indexes) entirely in non-volatile memory (NVM) potentially enables both high performance and fast recovery. To fully exploit parallelism on modern CPUs, modern main-memory databases use latch-free (lock-free) index structures, e.g. Bw-tree or skip lists. To achieve high performance NVM-resident indexes also need to be latch-free. This paper describes the design of the BzTree, a latch-free B-tree index designed for NVM. The BzTree uses a persistent multi-word compare-and-swap operation (PMwCAS) as a core building block, enabling an index design that has several important advantages compared with competing index structures such as the Bw-tree. First, the BzTree is latch-free yet simple to implement. Second, the BzTree is fast --- showing up to 2x higher throughput than the Bw-tree in our experiments. Third, the BzTree does not require any special-purpose recovery code. Recovery is near-instantaneous and only involves rolling back (or forward) any PMwCAS operations that were in-flight during failure. Our end-to-end recovery experiments of BzTree report an average recovery time of 145 $ \mu $ s. 
Finally, the same BzTree implementation runs seamlessly on both volatile RAM and NVM, which greatly reduces the cost of code maintenance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2018:FFP, author = "Yuzhen Huang and Tatiana Jin and Yidi Wu and Zhenkun Cai and Xiao Yan and Fan Yang and Jinfeng Li and Yuying Guo and James Cheng", title = "{FlexPS}: flexible parallelism control in parameter server architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "566--579", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177734", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As a general abstraction for coordinating the distributed storage and access of model parameters, the parameter server (PS) architecture enables distributed machine learning to handle large datasets and high dimensional models. Many systems, such as Parameter Server and Petuum, have been developed based on the PS architecture and widely used in practice. However, none of these systems supports changing parallelism during runtime, which is crucial for the efficient execution of machine learning tasks with dynamic workloads. We propose a new system, called FlexPS, which introduces a novel multi-stage abstraction to support flexible parallelism control. With the multi-stage abstraction, a machine learning task can be mapped to a series of stages and the parallelism for a stage can be set according to its workload. Optimizations such as stage scheduler, stage-aware consistency controller, and direct model transfer are proposed for the efficiency of multi-stage machine learning in FlexPS. 
As a general and complete PS system, FlexPS also incorporates many optimizations that are not limited to multi-stage machine learning. We conduct extensive experiments using a variety of machine learning workloads, showing that FlexPS achieves significant speedups and resource saving compared with the state-of-the-art PS systems such as Petuum and Multiverso.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yaghmazadeh:2018:AMH, author = "Navid Yaghmazadeh and Xinyu Wang and Isil Dillig", title = "Automated migration of hierarchical data to relational tables using programming-by-example", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "580--593", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177735", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While many applications export data in hierarchical formats like XML and JSON, it is often necessary to convert such hierarchical documents to a relational representation. This paper presents a novel programming-by-example approach, and its implementation in a tool called Mitra, for automatically migrating tree-structured documents to relational tables. We have evaluated the proposed technique using two sets of experiments. In the first experiment, we used Mitra to automate 98 data transformation tasks collected from StackOverflow. Our method can generate the desired program for 94\% of these benchmarks with an average synthesis time of 3.8 seconds. In the second experiment, we used Mitra to generate programs that can convert real-world XML and JSON datasets to full-fledged relational databases. 
Our evaluation shows that Mitra can automate the desired transformation for all datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luo:2018:TTO, author = "Siqiang Luo and Ben Kao and Guoliang Li and Jiafeng Hu and Reynold Cheng and Yudian Zheng", title = "{TOAIN}: a throughput optimizing adaptive index for answering dynamic {$k$ NN} queries on road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "594--606", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177736", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the classical k NN queries on road networks. Existing solutions mostly focus on reducing query processing time. In many applications, however, system throughput is a more important measure. We devise a mathematical model that describes throughput in terms of a number of system characteristics. We show that query time is only one of the many parameters that impact throughput. Others include update time and query/update arrival rates. We show that the traditional approach of improving query time alone is generally inadequate in optimizing throughput. Moreover, existing solutions lack flexibility in adapting to environments of different characteristics. We propose Toain, which is a very flexible algorithm that can be easily trained to adapt to a given environment for maximizing query throughput. We conduct extensive experiments on both real and synthetic data and show that Toain gives significantly higher throughput compared with existing solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2018:EMT, author = "Tian Li and Jie Zhong and Ji Liu and Wentao Wu and Ce Zhang", title = "{Ease.ml}: towards multi-tenant resource sharing for machine learning workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "607--620", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177737", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present ease.ml, a declarative machine learning service platform. With ease.ml, a user defines the high-level schema of an ML application and submits the task via a Web interface. The system then deals with the rest, such as model selection and data movement. The ultimate question we hope to understand is that, as a ``service provider'' that manages a shared cluster of machines running machine learning workloads, what is the resource sharing strategy that maximizes the global satisfaction of all our users? This paper does not completely answer this general question, but focuses on solving the first technical challenge we were facing when trying to build ease.ml. We observe that resource sharing is a critical yet subtle issue in this multi-tenant scenario, as we have to balance between efficiency and fairness. We first formalize the problem that we call multi-tenant model selection, aiming for minimizing the total regret of all users running automatic model selection tasks. We then develop a novel algorithm that combines multi-armed bandits with Bayesian optimization and prove a regret bound under the multi-tenant setting. 
Finally, we report our evaluation of ease.ml on synthetic data and on two services we are providing to our users, namely, image classification with deep neural networks and binary classification with Azure ML Studio. Our experimental evaluation results show that our proposed solution can be up to 9.8x faster in achieving the same global average accuracy for all users as the two popular heuristics used by our users before ease.ml, and 4.1x faster than state-of-the-art systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qi:2018:TOE, author = "Jianzhong Qi and Yufei Tao and Yanchuan Chang and Rui Zhang", title = "Theoretically optimal and empirically efficient {R}-trees with strong parallelizability", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "621--634", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177738", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The massive amount of data and large variety of data distributions in the big data era call for access methods that are efficient in both query processing and index bulk-loading, and over both practical and worst-case workloads. To address this need, we revisit a classic multidimensional access method --- the R-tree. We propose a novel R-tree packing strategy that produces R-trees with an asymptotically optimal I/O complexity for window queries in the worst case. Our experiments show that the R-trees produced by the proposed strategy are highly efficient on real and synthetic data of different distributions. The proposed strategy is also simple to parallelize, since it relies only on sorting. 
We propose a parallel algorithm for R-tree bulk-loading based on the proposed packing strategy, and analyze its performance under the massively parallel communication model. Experimental results confirm the efficiency and scalability of the parallel algorithm over large data sets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2018:DAM, author = "Xueling Lin and Lei Chen", title = "Domain-aware multi-truth discovery from conflicting sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "635--647", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177739", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the Big Data era, truth discovery has served as a promising technique to solve conflicts in the facts provided by numerous data sources. The most significant challenge for this task is to estimate source reliability and select the answers supported by high quality sources. However, existing works assume that one data source has the same reliability on any kinds of entity, ignoring the possibility that a source may vary in reliability on different domains. To capture the influence of various levels of expertise in different domains, we integrate domain expertise knowledge to achieve a more precise estimation of source reliability. We propose to infer the domain expertise of a data source based on its data richness in different domains. We also study the mutual influence between domains, which will affect the inference of domain expertise. Through leveraging the unique features of the multi-truth problem that sources may provide partially correct values of a data item, we assign more reasonable confidence scores to value sets. 
We propose an integrated Bayesian approach to incorporate the domain expertise of data sources and confidence scores of value sets, aiming to find multiple possible truths without any supervision. Experimental results on two real-world datasets demonstrate the feasibility, efficiency and effectiveness of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tian:2018:CAL, author = "Boyu Tian and Jiamin Huang and Barzan Mozafari and Grant Schoenebeck", title = "Contention-aware lock scheduling for transactional databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "5", pages = "648--662", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177732.3177740", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 17 07:25:04 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Lock managers are among the most studied components in concurrency control and transactional systems. However, one question seems to have been generally overlooked: ``When there are multiple lock requests on the same object, which one(s) should be granted first?'' Nearly all existing systems rely on a FIFO (first in, first out) strategy to decide which transaction(s) to grant the lock to. In this paper, however, we show that the lock scheduling choices have significant ramifications on the overall performance of a transactional system. Despite the large body of research on job scheduling outside the database context, lock scheduling presents subtle but challenging requirements that render existing results on scheduling inapt for a transactional database. 
By carefully studying this problem, we present the concept of contention-aware scheduling, show the hardness of the problem, and propose novel lock scheduling algorithms (LDSF and bLDSF), which guarantee a constant factor approximation of the best scheduling. We conduct extensive experiments using a popular database on both TPC-C and a microbenchmark. Compared to FIFO---the default scheduler in most database systems---our bLDSF algorithm yields up to 300x speedup in overall transaction latency. Alternatively, our LDSF algorithm, which is simpler and achieves comparable performance to bLDSF, has already been adopted by open-source community, and was chosen as the default scheduling strategy in MySQL 8.0.3+", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Patel:2018:QDP, author = "Jignesh M. Patel and Harshad Deshmukh and Jianqiao Zhu and Navneet Potti and Zuyu Zhang and Marc Spehlmann and Hakan Memisoglu and Saket Saurabh", title = "{Quickstep}: a data platform based on the scaling-up approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "6", pages = "663--676", month = feb, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3184470.3184471", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 10 06:50:54 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern servers pack enough storage and computing power that just a decade ago was spread across a modest-sized cluster. This paper presents a prototype system, called Quickstep, to exploit the large amount of parallelism that is packed inside modern servers. Quickstep builds on a vast body of previous methods for organizing data, optimizing, scheduling and executing queries, and brings them together in a single system. Quickstep also includes new query processing methods that go beyond previous approaches. 
To keep the project focused, the project's initial target is read-mostly in-memory data warehousing workloads in single-node settings. In this paper, we describe the design and implementation of Quickstep for this target application space. We also present experimental results comparing the performance of Quickstep to a number of other systems, demonstrating that Quickstep is often faster than many other contemporary systems, and in some cases faster by orders-of-magnitude. Quickstep is an Apache (incubating) project.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kondylakis:2018:CSB, author = "Haridimos Kondylakis and Niv Dayan and Kostas Zoumpatianos and Themis Palpanas", title = "{Coconut}: a scalable bottom-up approach for building data series indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "6", pages = "677--690", month = feb, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3184470.3184472", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 10 06:50:54 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many modern applications produce massive amounts of data series that need to be analyzed, requiring efficient similarity search operations. However, the state-of-the-art data series indexes that are used for this purpose do not scale well for massive datasets in terms of performance, or storage costs. We pinpoint the problem to the fact that existing summarizations of data series used for indexing cannot be sorted while keeping similar data series close to each other in the sorted order. This leads to two design problems. First, traditional bulk-loading algorithms based on sorting cannot be used. Instead, index construction takes place through slow top-down insertions, which create a non-contiguous index that results in many random I/Os. 
Second, data series cannot be sorted and split across nodes evenly based on their median value; thus, most leaf nodes are in practice nearly empty. This further slows down query speed and amplifies storage costs. To address these problems, we present Coconut. The first innovation in Coconut is an inverted, sortable data series summarization that organizes data series based on a z-order curve, keeping similar series close to each other in the sorted order. As a result, Coconut is able to use bulk-loading techniques that rely on sorting to quickly build a contiguous index using large sequential disk I/Os. We then explore prefix-based and median-based splitting policies for bottom-up bulk-loading, showing that median-based splitting outperforms the state of the art, ensuring that all nodes are densely populated. Overall, we show analytically and empirically that Coconut dominates the state-of-the-art data series indexes in terms of construction speed, query speed, and storage costs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ammar:2018:DES, author = "Khaled Ammar and Frank McSherry and Semih Salihoglu and Manas Joglekar", title = "Distributed evaluation of subgraph queries using worst-case optimal low-memory dataflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "6", pages = "691--704", month = feb, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3184470.3184473", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 10 06:50:54 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of finding and monitoring fixed-size subgraphs in a continually changing large-scale graph. 
We present the first approach that (i) performs worst-case optimal computation and communication, (ii) maintains a total memory footprint linear in the number of input edges, and (iii) scales down per-worker computation, communication, and memory requirements linearly as the number of workers increases, even on adversarially skewed inputs. Our approach is based on worst-case optimal join algorithms, recast as a data-parallel dataflow computation. We describe the general algorithm and modifications that make it robust to skewed data, prove theoretical bounds on its resource requirements in the massively parallel computing model, and implement and evaluate it on graphs containing as many as 64 billion edges. The underlying algorithm and ideas generalize from finding and monitoring subgraphs to the more general problem of computing and maintaining relational equi-joins over dynamic relations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2018:MFC, author = "Teng Li and Zhiyuan Xu and Jian Tang and Yanzhi Wang", title = "Model-free control for distributed stream data processing using deep reinforcement learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "6", pages = "705--718", month = feb, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3184470.3184474", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 10 06:50:54 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we focus on general-purpose Distributed Stream Data Processing Systems (DSDPSs), which deal with processing of unbounded streams of continuous data at scale distributedly in real or near-real time. A fundamental problem in a DSDPS is the scheduling problem (i.e., assigning workload to workers/machines) with the objective of minimizing average end-to-end tuple processing time. 
A widely-used solution is to distribute workload evenly over machines in the cluster in a round-robin manner, which is obviously not efficient due to lack of consideration for communication delay. Model-based approaches (such as queueing theory) do not work well either due to the high complexity of the system environment. We aim to develop a novel model-free approach that can learn to well control a DSDPS from its experience rather than accurate and mathematically solvable system models, just as a human learns a skill (such as cooking, driving, swimming, etc). Specifically, we, for the first time, propose to leverage emerging Deep Reinforcement Learning (DRL) for enabling model-free control in DSDPSs; and present design, implementation and evaluation of a novel and highly effective DRL-based control framework, which minimizes average end-to-end tuple processing time by jointly learning the system environment via collecting very limited runtime statistics data and making decisions under the guidance of powerful Deep Neural Networks (DNNs). To validate and evaluate the proposed framework, we implemented it based on a widely-used DSDPS, Apache Storm, and tested it with three representative applications: continuous queries, log stream processing and word count (stream version). Extensive experimental results show (1) Compared to Storm's default scheduler and the state-of-the-art model-based method, the proposed framework reduces average tuple processing by 33.5\% and 14.0\% respectively on average. (2) The proposed framework can quickly reach a good scheduling solution during online learning, which justifies its practicability for online control in DSDPSs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Psallidas:2018:SFG, author = "Fotis Psallidas and Eugene Wu", title = "{Smoke}: fine-grained lineage at interactive speed", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "6", pages = "719--732", month = feb, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3184470.3184475", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 10 06:50:54 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data lineage describes the relationship between individual input and output data items of a workflow and is an integral ingredient for both traditional (e.g., debugging or auditing) and emergent (e.g., explanations or cleaning) applications. The core, long-standing problem that lineage systems need to address---and the main focus of this paper---is to quickly capture lineage across a workflow in order to speed up future queries over lineage. Current lineage systems, however, either incur high lineage capture overheads, high lineage query processing costs, or both. In response, developers resort to manual implementations of applications that, in principle, can be expressed and optimized in lineage terms. This paper describes Smoke, an in-memory database engine that provides both fast lineage capture and lineage query processing. To do so, Smoke tightly integrates the lineage capture logic into physical database operators; stores lineage in efficient lineage representations; and employs optimizations if future lineage queries are known up-front. Our experiments on microbenchmarks and realistic workloads show that Smoke reduces the lineage capture overhead and lineage query costs by multiple orders of magnitude as compared to state-of-the-art alternatives.
On real-world applications, we show that Smoke meets the latency requirements of interactive visualizations (e.g., $<$ 150ms) and outperforms hand-written implementations of data profiling primitives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Idris:2018:CQI, author = "Muhammad Idris and Mart{\'\i}n Ugarte and Stijn Vansummeren and Hannes Voigt and Wolfgang Lehner", title = "Conjunctive queries with inequalities under updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "733--745", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192966", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern application domains such as Composite Event Recognition (CER) and real-time Analytics require the ability to dynamically refresh query results under high update rates. Traditional approaches to this problem are based either on the materialization of subresults (to avoid their recomputation) or on the recomputation of subresults (to avoid the space overhead of materialization). Both techniques have recently been shown suboptimal: instead of materializing results and subresults, one can maintain a data structure that supports efficient maintenance under updates and can quickly enumerate the full query output, as well as the changes produced under single updates. Unfortunately, these data structures have been developed only for aggregate-join queries composed of equi-joins, limiting their applicability in domains such as CER where temporal joins are commonplace. 
In this paper, we present a new approach for dynamically evaluating queries with multi-way $ \theta $-joins under updates that is effective in avoiding both materialization and recomputation of results, while supporting a wide range of applications. To do this we generalize Dynamic Yannakakis, an algorithm for dynamically processing acyclic equi-join queries. In tandem, and of independent interest, we generalize the notions of acyclicity and free-connexity to arbitrary $ \theta $-joins. We instantiate our framework to the case where $ \theta $-joins are only composed of equalities and inequalities ($<$, $ \leq $, $=$, $>$, $ \geq $) and experimentally compare this algorithm, called IEDyn, to state of the art CER systems as well as incremental view maintenance engines. IEDyn performs consistently better than the competitor systems with up to two orders of magnitude improvements in both time and memory consumption.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yint:2018:BER, author = "Zhicheng Yin and Jin Sun and Ming Li and Jaliya Ekanayake and Haibo Lin and Marc Friedman and Jos{\'e} A. Blakeley and Clemens Szyperski and Nikhil R. Devanur", title = "Bubble execution: resource-aware reliable analytics at cloud scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "746--758", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192967", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Enabling interactive data exploration at cloud scale requires minimizing end-to-end query execution latency, while guaranteeing fault tolerance, and query execution under resource-constraints.
Typically, such a query execution involves orchestrating the execution of hundreds or thousands of related tasks on cloud scale clusters. Without any resource constraints, all query tasks can be scheduled to execute simultaneously (gang scheduling) while connected tasks stream data between them. When the data size referenced by a query increases, gang scheduling may be resource-wasteful or un-satisfiable with a limited, per-query resource budget. This paper introduces Bubble Execution, a new query processing framework for interactive workloads at cloud scale, that balances cost-based query optimization, fault tolerance, optimal resource management, and execution orchestration. Bubble execution involves dividing a query execution graph into a collection of query sub-graphs (bubbles), and scheduling them within a per-query resource budget. The query operators (tasks) inside a bubble stream data between them while fault tolerance is handled by persisting temporary results at bubble boundaries. Our implementation enhances our JetScope service, for interactive workloads, deployed in production clusters at Microsoft. Experiments with TPC-H queries show that bubble execution can reduce resource usage significantly in the presence of failures while maintaining performance competitive with gang execution.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kruse:2018:EDA, author = "Sebastian Kruse and Felix Naumann", title = "Efficient discovery of approximate dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "759--772", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192968", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Functional dependencies (FDs) and unique column combinations (UCCs) form a valuable ingredient for many data management tasks, such as data cleaning, schema recovery, and query optimization. Because these dependencies are unknown in most scenarios, their automatic discovery has been well researched. However, existing methods mostly discover only exact dependencies, i.e., those without violations. Real-world dependencies, in contrast, are frequently approximate due to data exceptions, ambiguities, or data errors. This relaxation to approximate dependencies renders their discovery an even harder task than the already challenging exact dependency discovery. To this end, we propose the novel and highly efficient algorithm Pyro to discover both approximate FDs and approximate UCCs. Pyro combines a separate-and-conquer search strategy with sampling-based guidance that quickly detects dependency candidates and verifies them. In our broad experimental evaluation, Pyro outperforms existing discovery algorithms by a factor of up to 33, scales to larger datasets, and at the same time requires the least main memory.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2018:RID, author = "Yue Wang and Alexandra Meliou and Gerome Miklau", title = "{RC-index}: diversifying answers to range queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "773--786", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192969", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query result diversification is widely used in data exploration, Web search, and recommendation systems. The problem of returning diversified query results consists of finding a small subset of valid query answers that are representative and different from one another, usually quantified by a diversity score. Most existing techniques for query diversification first compute all valid query results and then find a diverse subset. These techniques are inefficient when the set of valid query results is large. Other work has proposed efficient solutions for restricted application settings, where results are shared across multiple queries. In this paper, our goal is to support result diversification for general range queries over a single relation. We propose the RC-Index, a novel index structure that achieves efficiency by reducing the number of items that must be retrieved by the database to form a diverse set of the desired size (about 1 second for a dataset of 1 million items). Further, we prove that an RC-Index offers strong approximation guarantees. To the best of our knowledge, this is the first index-based diversification method with a guaranteed approximation ratio for range queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2018:UUP, author = "Xin Ding and Lu Chen and Yunjun Gao and Christian S. Jensen and Hujun Bao", title = "{UlTraMan}: a unified platform for big trajectory data management and analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "787--799", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192970", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Massive trajectory data is being generated by GPS-equipped devices, such as cars and mobile phones, which is used increasingly in transportation, location-based services, and urban computing. As a result, a variety of methods have been proposed for trajectory data management and analytics. However, traditional systems and methods are usually designed for very specific data management or analytics needs, which forces users to stitch together heterogeneous systems to analyze trajectory data in an inefficient manner. Targeting the overall data pipeline of big trajectory data management and analytics, we present a unified platform, termed as UlTraMan. In order to achieve scalability, efficiency, persistence, and flexibility, (i) we extend Apache Spark with respect to both data storage and computing by seamlessly integrating a key--value store, and (ii) we enhance the MapReduce paradigm to allow flexible optimizations based on random data access. We study the resulting system's flexibility using case studies on data retrieval, aggregation analyses, and pattern mining. Extensive experiments on real and synthetic trajectory data are reported to offer insight into the scalability and performance of UlTraMan.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jindal:2018:SSM, author = "Alekh Jindal and Konstantinos Karanasos and Sriram Rao and Hiren Patel", title = "Selecting subexpressions to materialize at datacenter scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "800--812", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192971", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We observe significant overlaps in the computations performed by user jobs in modern shared analytics clusters. Na{\"\i}vely computing the same subexpressions multiple times results in wasting cluster resources and longer execution times. Given that these shared cluster workloads consist of tens of thousands of jobs, identifying overlapping computations across jobs is of great interest to both cluster operators and users. Nevertheless, existing approaches support orders of magnitude smaller workloads or employ heuristics with limited effectiveness. In this paper, we focus on the problem of subexpression selection for large workloads, i.e., selecting common parts of job plans and materializing them to speed-up the evaluation of subsequent jobs. We provide an ILP-based formulation of our problem and map it to a bipartite graph labeling problem. Then, we introduce BigSubs, a vertex-centric graph algorithm to iteratively choose in parallel which subexpressions to materialize and which subexpressions to use for evaluating each job. We provide a distributed implementation of our approach using our internal SQL-like execution framework, SCOPE, and assess its effectiveness over production workloads. BigSubs supports workloads with tens of thousands of jobs, yielding savings of up to 40\% in machine-hours. 
We are currently integrating our techniques with the SCOPE runtime in our production clusters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nargesian:2018:TUS, author = "Fatemeh Nargesian and Erkang Zhu and Ken Q. Pu and Ren{\'e}e J. Miller", title = "Table union search on open data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "813--825", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192973", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We define the table union search problem and present a probabilistic solution for finding tables that are unionable with a query table within massive repositories. Two tables are unionable if they share attributes from the same domain. Our solution formalizes three statistical models that describe how unionable attributes are generated from set domains, semantic domains with values from an ontology, and natural language domains. We propose a data-driven approach that automatically determines the best model to use for each pair of attributes. Through a distribution-aware algorithm, we are able to find the optimal number of attributes in two tables that can be unioned. To evaluate accuracy, we created and open-sourced a benchmark of Open Data tables. We show that our table union search outperforms in speed and accuracy existing algorithms for finding related tables and scales to provide efficient search over Open Data repositories containing more than one million attributes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2018:STH, author = "Jianfei Chen and Jun Zhu and Jie Lu and Shixia Liu", title = "Scalable training of hierarchical topic models", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "7", pages = "826--839", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3192965.3192972", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale topic models serve as basic tools for feature extraction and dimensionality reduction in many practical applications. As a natural extension of flat topic models, hierarchical topic models (HTMs) are able to learn topics of different levels of abstraction, which lead to deeper understanding and better generalization than their flat counterparts. However, existing scalable systems for flat topic models cannot handle HTMs, due to their complicated data structures such as trees and concurrent dynamically growing matrices, as well as their susceptibility to local optima. In this paper, we study the hierarchical latent Dirichlet allocation (hLDA) model which is a powerful nonparametric Bayesian HTM. We propose an efficient partially collapsed Gibbs sampling algorithm for hLDA, as well as an initialization strategy to deal with local optima introduced by tree-structured models. We also identify new system challenges in building scalable systems for HTMs, and propose efficient data layout for vectorizing HTM as well as distributed data structures including dynamic matrices and trees. Empirical studies show that our system is 87 times more efficient than the previous open-source implementation for hLDA, and can scale to thousands of CPU cores. 
We demonstrate our scalability on a 131-million-document corpus with 28 billion tokens, which is 4--5 orders of magnitude larger than previously used corpus. Our distributed implementation can extract 1,722 topics from the corpus with 50 machines in just 7 hours.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Coskun:2018:IFN, author = "Mustafa Coskun and Ananth Grama and Mehmet Koyut{\"u}rk", title = "Indexed fast network proximity querying", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "840--852", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204029", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Node proximity queries are among the most common operations on network databases. A common measure of node proximity is random walk based proximity, which has been shown to be less susceptible to noise and missing data. Real-time processing of random-walk based proximity queries poses significant computational challenges for larger graphs with over billions of nodes and edges, since it involves solution of large linear systems of equations. Due to the importance of this operation, significant effort has been devoted to developing efficient methods for random-walk based node proximity computations. These methods either aim to speed up iterative computations by exploiting numerical properties of random walks, or rely on computation and storage of matrix inverses to avoid computation during query processing. Although both approaches have been well studied, the speedup achieved by iterative approaches does not translate to real-time query processing, and the storage requirements of inversion-based approaches prohibit their use on very large graph databases. 
We present a novel approach to significantly reducing the computational cost of random walk based node proximity queries with scalable indexing. Our approach combines domain graph-partitioning based indexing with fast iterative computations during query processing using Chebyshev polynomials over the complex elliptic plane. This approach combines the query processing benefits of inversion techniques with the memory and storage benefits of iterative approaches. Using real-world networks with billions of nodes and edges, and top-$k$ proximity queries as the benchmark problem, we show that our algorithm, I-Chopper, significantly outperforms existing methods. Specifically, it drastically reduces convergence time of the iterative procedure, while also reducing storage requirements for indexing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zheng:2018:ODP, author = "Libin Zheng and Lei Chen and Jieping Ye", title = "Order dispatch in price-aware ridesharing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "853--865", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204030", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the prevalence of car-hailing applications, ridesharing becomes more and more popular because of its great potential in monetary saving and environmental protection. Order dispatch is the key problem in ridesharing, which has a strong impact on riders' experience and platform's performance. Existing order dispatch research works fail to consider the price of the orders, which can be an important reference because it directly relates to the platform's profit.
Our work takes the order price into concern, and formulates a constrained optimization problem, which takes platform's profit as the optimization objective and performs controls on riders' detour distance and waiting time. We prove the problem is NP-hard, thus, we propose approximation methods. We further develop a simulation framework based on real ridesharing order and vehicle data. We conduct experiments with this simulation framework to evaluate the effectiveness and efficiency of the proposed methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mouratidis:2018:EPU, author = "Kyriakos Mouratidis and Bo Tang", title = "Exact processing of uncertain top-$k$ queries in multi-criteria settings", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "866--879", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204031", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional rank-aware processing assumes a dataset that contains available options to cover a specific need (e.g., restaurants, hotels, etc) and users who browse that dataset via top-$k$ queries with linear scoring functions, i.e., by ranking the options according to the weighted sum of their attributes, for a set of given weights. In practice, however, user preferences (weights) may only be estimated with bounded accuracy, or may be inherently uncertain due to the inability of a human user to specify exact weight values with absolute accuracy. Motivated by this, we introduce the uncertain top-$k$ query (UTK). Given uncertain preferences, that is, an approximate description of the weight values, the UTK query reports all options that may belong to the top-$k$ set.
A second version of the problem additionally reports the exact top-$k$ set for each of the possible weight settings. We develop a scalable processing framework for both UTK versions, and demonstrate its efficiency using standard benchmark datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Berti-Equille:2018:DGF, author = "Laure Berti-{\'E}quille and Hazar Harmouch and Felix Naumann and No{\"e}l Novelli and Saravanan Thirumuruganathan", title = "Discovery of genuine functional dependencies from relational data with missing values", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "880--892", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204032", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Functional dependencies (FDs) play an important role in maintaining data quality. They can be used to enforce data consistency and to guide repairs over a database. In this work, we investigate the problem of missing values and its impact on FD discovery. When using existing FD discovery algorithms, some genuine FDs could not be detected precisely due to missing values or some non-genuine FDs can be discovered even though they are caused by missing values with a certain NULL semantics. We define a notion of genuineness and propose algorithms to compute the genuineness score of a discovered FD. This can be used to identify the genuine FDs among the set of all valid dependencies that hold on the data. We evaluate the quality of our method over various real-world and semi-synthetic datasets with extensive experiments.
The results show that our method performs well for relatively large FD sets and is able to accurately capture genuine FDs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cai:2018:ETD, author = "Qingchao Cai and Zhongle Xie and Meihui Zhang and Gang Chen and H. V. Jagadish and Beng Chin Ooi", title = "Effective temporal dependence discovery in time series data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "893--905", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204033", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To analyze user behavior over time, it is useful to group users into cohorts, giving rise to cohort analysis. We identify several crucial limitations of current cohort analysis, motivated by the unmet need for temporal dependence discovery. To address these limitations, we propose a generalization that we call recurrent cohort analysis. We introduce a set of operators for recurrent cohort analysis and design access methods specific to these operators in both single-node and distributed environments. Through extensive experiments, we show that recurrent cohort analysis when implemented using the proposed access methods is up to six orders faster than one implemented as a layer on top of a database in a single-node setting, and two orders faster than one implemented using Spark SQL in a distributed setting.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Arora:2018:HIP, author = "Akhil Arora and Sakshi Sinha and Piyush Kumar and Arnab Bhattacharya", title = "{HD-index}: pushing the scalability-accuracy boundary for approximate {kNN} search in high-dimensional spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "906--919", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204034", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearest neighbor searching of large databases in high-dimensional spaces is inherently difficult due to the curse of dimensionality. A flavor of approximation is, therefore, necessary to practically solve the problem of nearest neighbor search. In this paper, we propose a novel yet simple indexing scheme, HD-Index, to solve the problem of approximate k-nearest neighbor queries in massive high-dimensional databases. HD-Index consists of a set of novel hierarchical structures called RDB-trees built on Hilbert keys of database objects. The leaves of the RDB-trees store distances of database objects to reference objects, thereby allowing efficient pruning using distance filters. In addition to triangular inequality, we also use Ptolemaic inequality to produce better lower bounds. Experiments on massive (up to billion scale) high-dimensional (up to 1000+) datasets show that HD-Index is effective, efficient, and scalable.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ahmad:2018:LSL, author = "Yousuf Ahmad and Omar Khattab and Arsal Malik and Ahmad Musleh and Mohammad Hammoud and Mucahid Kutlu and Mostafa Shehata and Tamer Elsayed", title = "{LA3}: a scalable link- and locality-aware linear algebra-based graph analytics system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "8", pages = "920--933", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3204028.3204035", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 29 08:31:56 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents LA3, a scalable distributed system for graph analytics. LA3 couples a vertex-based programming model with a highly optimized linear algebra-based engine. It translates any vertex-centric program into an iteratively executed sparse matrix-vector multiplication (SpMV). To reduce communication and enhance scalability, the adjacency matrix representing an input graph is partitioned into locality-aware 2D tiles distributed across multiple processes. Alongside, three major optimizations are incorporated to preclude redundant computations and minimize communication. First, the link-based structure of the input graph is exploited to classify vertices into different types. Afterwards, vertices of special types are factored out of the main loop of the graph application to avoid superfluous computations. We refer to this novel optimization as computation filtering. Second, a communication filtering mechanism is involved to optimize for the high sparsity of the input matrix due to power-law distributions, common in real-world graphs. This optimization ensures that each process receives only the messages that pertain to non-zero entries in its tiles, substantially reducing communication traffic since most tiles are highly sparse. 
Lastly, a pseudo-asynchronous computation and communication optimization is proposed, whereby processes progress and communicate asynchronously, consume messages as soon as they become available, and block otherwise. We implemented and extensively tested LA3 on private and public clouds. Results show that LA3 outperforms six related state-of-the-art and popular distributed graph analytics systems by an average of 10X.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2018:TSE, author = "Dongxiang Zhang and Mengting Ding and Dingyu Yang and Yi Liu and Ju Fan and Heng Tao Shen", title = "Trajectory simplification: an experimental study and quality analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "934--946", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213885", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ubiquitousness of GPS sensors in smart-phones, vehicles and wearable devices has enabled the collection of massive volumes of trajectory data from tracing moving objects. Consequently, an unprecedented scale of timestamped GPS data has been generated and posed an urgent demand for an effective storage mechanism for trajectory databases. The mainstream compression technique is called trajectory simplification, that finds a subsequence to approximate the original trajectory and attempts to minimize the information loss under a distance measure. Even though various simplification algorithms have been proposed in the past decades, there still lacks a thorough comparison to cover all the state-of-the-art algorithms and evaluate their quality using datasets in diversified motion patterns. 
Hence, it still remains a challenge for GPS data collectors to determine a proper algorithm in a concrete application. In addition, almost the entire line of previous methods uses error-based metrics to evaluate the compression quality, while ignoring their usability in supporting spatio-temporal queries on top of the reduced database. To bridge these gaps, we conduct so far the most comprehensive evaluation on trajectory simplification techniques. We compare the performance of 25 algorithms in total using five real datasets in different motion patterns. According to the experimental findings, we present useful guidance for the selection or development of effective trajectory simplification algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antenucci:2018:CBE, author = "Dolan Antenucci and Michael Cafarella", title = "Constraint-based explanation and repair of filter-based transformations", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "947--960", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213886", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data analysts often need to transform an existing dataset, such as with filtering, into a new dataset for downstream analysis. Even the most trivial of mistakes in this phase can introduce bias and lead to the formation of invalid conclusions. For example, consider a researcher identifying subjects for trials of a new statin drug. She might identify patients with a high dietary cholesterol intake as a population likely to benefit from the drug, however, selection of these individuals could bias the test population to those with a generally unhealthy lifestyle, thereby compromising the analysis. 
Reducing the potential for bias in the dataset transformation process can minimize the need to later engage in the tedious, time-consuming process of trying to eliminate bias while preserving the target dataset. We propose a novel interaction model for explain-and-repair data transformation systems, in which users interactively define constraints for transformation code and the resultant data. The system satisfies these constraints as far as possible, and provides an explanation for any problems encountered. We present an algorithm that yields filter-based transformation code satisfying user constraints. We implemented and evaluated a prototype of this architecture, Emeril, using both synthetic and real-world datasets. Our approach finds solutions 34\% more often and 77\% more quickly than the previous state-of-the-art solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2018:SSQ, author = "Xiaolan Wang and Aaron Feng and Behzad Golshan and Alon Halevy and George Mihaila and Hidekazu Oiwa and Wang-Chiew Tan", title = "Scalable semantic querying of text", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "961--974", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213887", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present the Koko system that takes declarative information extraction to a new level by incorporating advances in natural language processing techniques in its extraction language. Koko is novel in that its extraction language simultaneously supports conditions on the surface of the text and on the structure of the dependency parse tree of sentences, thereby allowing for more refined extractions.
Koko also supports conditions that are forgiving to linguistic variation of expressing concepts and allows to aggregate evidence from the entire document in order to filter extractions. To scale up, Koko exploits a multi-indexing scheme and heuristics for efficient extractions. We extensively evaluate Koko over publicly available text corpora. We show that Koko indices take up the smallest amount of space, are notably faster and more effective than a number of prior indexing schemes. Finally, we demonstrate Koko's scalability on a corpus of 5 million Wikipedia articles.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bellomarini:2018:VSD, author = "Luigi Bellomarini and Emanuel Sallinger and Georg Gottlob", title = "The {Vadalog} system: {Datalog}-based reasoning for knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "975--987", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213888", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the past years, there has been a resurgence of Datalog-based systems in the database community as well as in industry. In this context, it has been recognized that to handle the complex knowledge-based scenarios encountered today, such as reasoning over large knowledge graphs, Datalog has to be extended with features such as existential quantification. Yet, Datalog-based reasoning in the presence of existential quantification is in general undecidable. Many efforts have been made to define decidable fragments. Warded Datalog+/- is a very promising one, as it captures PTIME complexity while allowing ontological reasoning. Yet so far, no implementation of Warded Datalog+/- was available.
In this paper we present the Vadalog system, a Datalog-based system for performing complex logic reasoning tasks, such as those required in advanced knowledge graphs. The Vadalog system is Oxford's contribution to the VADA research programme, a joint effort of the universities of Oxford, Manchester and Edinburgh and around 20 industrial partners. As the main contribution of this paper, we illustrate the first implementation of Warded Datalog+/-, a high-performance Datalog+/- system utilizing an aggressive termination control strategy. We also provide a comprehensive experimental evaluation.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Medya:2018:NND, author = "Sourav Medya and Sayan Ranu and Jithin Vachery and Ambuj Singh", title = "Noticeable network delay minimization via node upgrades", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "988--1001", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213889", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In several domains, the flow of data is governed by an underlying network. Reduction of delays in end-to-end data flow is an important network optimization task. Reduced delays enable shorter travel times for vehicles in road networks, faster information flow in social networks, and increased rate of packets in communication networks. While techniques for network delay minimization have been proposed, they fail to provide any noticeable reduction in individual data flows. Furthermore, they treat all nodes as equally important, which is often not the case in real-world networks. 
In this paper, we incorporate these practical aspects and propose a network design problem where the goal is to perform k network upgrades such that it maximizes the number of flows in the network with a noticeable reduction in delay. We show that the problem is NP-hard, APX-hard, and non-submodular. We overcome these computational challenges by designing an importance sampling based algorithm with provable quality guarantees. Through extensive experiments on real and synthetic data sets, we establish that importance sampling imparts up to 1000 times speed-up over the greedy approach, and provides up to 70 times the improvement achieved by the state-of-the-art technique.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Palkar:2018:EEE, author = "Shoumik Palkar and James Thomas and Deepak Narayanan and Pratiksha Thaker and Rahul Palamuttam and Parimajan Negi and Anil Shanbhag and Malte Schwarzkopf and Holger Pirk and Saman Amarasinghe and Samuel Madden and Matei Zaharia", title = "Evaluating end-to-end optimization for data analytics applications in {Weld}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "1002--1015", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213890", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern analytics applications use a diverse mix of libraries and functions. Unfortunately, there is no optimization across these libraries, resulting in performance penalties as high as an order of magnitude in many applications. To address this problem, we proposed Weld, a common runtime for existing data analytics libraries that performs key physical optimizations such as pipelining under existing, imperative library APIs. 
In this work, we further develop the Weld vision by designing an automatic adaptive optimizer for Weld applications, and evaluating its impact on realistic data science workloads. Our optimizer eliminates multiple forms of overhead that arise when composing imperative libraries like Pandas and NumPy, and uses lightweight measurements to make data-dependent decisions at run-time in ad-hoc workloads where no statistics are available, with sub-second overhead. We also evaluate which optimizations have the largest impact in practice and whether Weld can be integrated into libraries incrementally. Our results are promising: using our optimizer, Weld accelerates data science workloads by up to 23X on one thread and 80X on eight threads, and its adaptive optimizations provide up to a 3.75X speedup over rule-based optimization. Moreover, Weld provides benefits if even just 4--5 operators in a library are ported to use it. Our results show that common runtime designs like Weld may be a viable approach to accelerate analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Muller:2018:ISE, author = "Magnus M{\"u}ller and Guido Moerkotte and Oliver Kolb", title = "Improved selectivity estimation by combining knowledge from sampling and synopses", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "1016--1028", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213882", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Estimating selectivities remains a critical task in query processing. Optimizers rely on the accuracy of selectivities when generating execution plans and, in approximate query answering, estimated selectivities affect the quality of the result. 
Many systems maintain synopses, e.g., histograms, and, in addition, provide sampling facilities. In this paper, we present a novel approach to combine knowledge from synopses and sampling for the purpose of selectivity estimation for conjunctive queries. We first show how to extract information from synopses and sampling such that they are mutually consistent. In a second step, we show how to combine them and decide on an admissible selectivity estimate. We compare our approach to state-of-the-art methods and evaluate the strengths and limitations of each approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Han:2018:EAA, author = "Kai Han and Keke Huang and Xiaokui Xiao and Jing Tang and Aixin Sun and Xueyan Tang", title = "Efficient algorithms for adaptive influence maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "1029--1040", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213883", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a social network $G$, the influence maximization (IM) problem seeks a set $S$ of $k$ seed nodes in $G$ to maximize the expected number of nodes activated via an influence cascade starting from $S$. Although a lot of algorithms have been proposed for IM, most of them only work under the non-adaptive setting, i.e., when all $k$ seed nodes are selected before we observe how they influence other users. In this paper, we study the adaptive IM problem, where we select the $k$ seed nodes in batches of equal size $b$, such that the choice of the $i$-th batch can be made after the influence results of the first $ i - 1$ batches are observed. 
We propose the first practical algorithms for adaptive IM with an approximation guarantee of $ 1 - \exp (\xi - 1)$ for $ b = 1$ and $ 1 - \exp (\xi - 1 + 1 / e)$ for $ b > 1$, where $ \xi $ is any number in $ (0, 1)$. Our approach is based on a novel AdaptGreedy framework instantiated by non-adaptive IM algorithms, and its performance can be substantially improved if the non-adaptive IM algorithm has a small expected approximation error. However, no current non-adaptive IM algorithms provide such a desired property. Therefore, we further propose a non-adaptive IM algorithm called EPIC, which not only has the same worst-case performance bounds with that of the state-of-the-art non-adaptive IM algorithms, but also has a reduced expected approximation error. We also provide a theoretical analysis to quantify the performance gain brought by instantiating AdaptGreedy using EPIC, compared with a naive approach using the existing IM algorithms. Finally, we use real social networks to evaluate the performance of our approach through extensive experiments, and the experimental experiments strongly corroborate the superiorities of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Breslow:2018:MFF, author = "Alex D. Breslow and Nuwan S. 
Jayasena", title = "{Morton} filters: faster, space-efficient cuckoo filters via biasing, compression, and decoupled logical sparsity", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "1041--1055", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213884", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate set membership data structures (ASMDSs) are ubiquitous in computing. They trade a tunable, often small, error rate ($ \epsilon $) for large space savings. The canonical ASMDS is the Bloom filter, which supports lookups and insertions but not deletions in its simplest form. Cuckoo filters (CFs), a recently proposed class of ASMDSs, add deletion support and often use fewer bits per item for equal $ \epsilon $. This work introduces the Morton filter (MF), a novel ASMDS that introduces several key improvements to CFs. Like CFs, MFs support lookups, insertions, and deletions, but improve their respective throughputs by 1.3x to 2.5x, 0.9x to 15.5x, and 1.3x to 1.6x. MFs achieve these improvements by (1) introducing a compressed format that permits a logically sparse filter to be stored compactly in memory, (2) leveraging succinct embedded metadata to prune unnecessary memory accesses, and (3) heavily biasing insertions to use a single hash function. With these optimizations, lookups, insertions, and deletions often only require accessing a single hardware cache line from the filter. These improvements are not at a loss in space efficiency, as MFs typically use comparable to slightly less space than CFs for the same $ \epsilon $.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bi:2018:OPA, author = "Fei Bi and Lijun Chang and Xuemin Lin and Wenjie Zhang", title = "An optimal and progressive approach to online search of top-$k$ influential communities", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "9", pages = "1056--1068", month = may, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3213880.3213881", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 30 09:26:43 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Community search over large graphs is a fundamental problem in graph analysis. Recent studies propose to compute top-$k$ influential communities, where each reported community not only is a cohesive subgraph but also has a high influence value. The existing approaches to the problem of top-$k$ influential community search can be categorized as index-based algorithms and online search algorithms without indexes. The index-based algorithms, although being very efficient in conducting community searches, need to pre-compute a special-purpose index and only work for one built-in vertex weight vector. In this paper, we investigate online search approaches and propose an instance-optimal algorithm LocalSearch whose time complexity is linearly proportional to the size of the smallest subgraph that a correct algorithm needs to access without indexes. In addition, we also propose techniques to make LocalSearch progressively compute and report the communities in decreasing influence value order such that $k$ does not need to be specified. Moreover, we extend our framework to the general case of top-$k$ influential community search regarding other cohesiveness measures.
Extensive empirical studies on real graphs demonstrate that our algorithms outperform the existing online search algorithms by several orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Meister:2018:EAT, author = "Andreas Meister and Guido Moerkotte and Gunter Saake", title = "Errata for {``Analysis of two existing and one new dynamic programming algorithm for the generation of optimal bushy join trees without cross products''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1069--1070", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the published version of EnumerateCmp in the Section 3.3 on Page 936 [1], see also Algorithm 1, a small error is included in Line 5. In the first call of EnumerateCsgRec, too many nodes $ (X \cup N) $ will be excluded for the emission of complements, leading to the fact that, in general, not all complements will be emitted correctly.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2018:DSB, author = "Noseong Park and Mahmoud Mohammadi and Kshitij Gorde and Sushil Jajodia and Hongkyu Park and Youngmin Kim", title = "Data synthesis based on generative adversarial networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1071--1083", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Privacy is an important concern for our society where sharing data with partners or releasing data to the public is a frequent occurrence. Some of the techniques that are being used to achieve privacy are to remove identifiers, alter quasi-identifiers, and perturb values. Unfortunately, these approaches suffer from two limitations. First, it has been shown that private information can still be leaked if attackers possess some background knowledge or other information sources. Second, they do not take into account the adverse impact these methods will have on the utility of the released data. In this paper, we propose a method that meets both requirements. Our method, called table-GAN, uses generative adversarial networks (GANs) to synthesize fake tables that are statistically similar to the original table yet do not incur information leakage. We show that the machine learning models trained using our synthetic tables exhibit performance that is similar to that of models trained using the original table for unknown testing cases. We call this property model compatibility. We believe that anonymization/perturbation/synthesis methods without model compatibility are of little value. 
We used four real-world datasets from four different domains for our experiments and conducted in-depth comparisons with state-of-the-art anonymization, perturbation, and generation techniques. Throughout our experiments, only our method consistently shows balance between privacy level and model compatibility.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lockard:2018:CDS, author = "Colin Lockard and Xin Luna Dong and Arash Einolghozati and Prashant Shiralkar", title = "{CERES}: distantly supervised relation extraction from the semi-structured web", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1084--1096", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The web contains countless semi-structured websites, which can be a rich source of information for populating knowledge bases. Existing methods for extracting relations from the DOM trees of semi-structured webpages can achieve high precision and recall only when manual annotations for each website are available. Although there have been efforts to learn extractors from automatically generated labels, these methods are not sufficiently robust to succeed in settings with complex schemas and information-rich websites. In this paper we present a new method for automatic extraction from semi-structured websites based on distant supervision. We automatically generate training labels by aligning an existing knowledge base with a website and leveraging the unique structural characteristics of semi-structured websites. We then train a classifier based on the potentially noisy and incomplete labels to predict new relation instances. 
Our method can compete with annotation-based techniques in the literature in terms of extraction quality. A large-scale experiment on over 400,000 pages from dozens of multi-lingual long-tail websites harvested 1.25 million facts at a precision of 90\%.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nazi:2018:EEI, author = "Azade Nazi and Bolin Ding and Vivek Narasayya and Surajit Chaudhuri", title = "Efficient estimation of inclusion coefficient using {HyperLogLog} sketches", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1097--1109", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficiently estimating the inclusion coefficient --- the fraction of values of one column that are contained in another column --- is useful for tasks such as data profiling and foreign-key detection. We present a new estimator, BML, for inclusion coefficient based on Hyperloglog sketches that results in significantly lower error compared to the state-of-the-art approach that uses Bottom-k sketches. We evaluate the error of the BML estimator using experiments on industry benchmarks such as TPC-H and TPC-DS, and several real-world databases. As an independent contribution, we show how Hyperloglog sketches can be maintained incrementally with data deletions using only a constant amount of additional memory.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fier:2018:SSJ, author = "Fabian Fier and Nikolaus Augsten and Panagiotis Bouros and Ulf Leser and Johann-Christoph Freytag", title = "Set similarity joins on {MapReduce}: an experimental survey", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1110--1122", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Set similarity joins, which compute pairs of similar sets, constitute an important operator primitive in a variety of applications, including applications that must process large amounts of data. To handle these data volumes, several distributed set similarity join algorithms have been proposed. Unfortunately, little is known about the relative performance, strengths and weaknesses of these techniques. Previous comparisons are limited to a small subset of relevant algorithms, and the large differences in the various test setups make it hard to draw overall conclusions. In this paper we survey ten recent, distributed set similarity join algorithms, all based on the MapReduce paradigm. We empirically compare the algorithms in a uniform test environment on twelve datasets that expose different characteristics and represent a broad range of applications. Our experiments yield a surprising result: All algorithms in our test fail to scale for at least one dataset and are sensitive to long sets, frequent set elements, low similarity thresholds, or a combination thereof. Interestingly, some algorithms even fail to handle the small datasets that can easily be processed in a non-distributed setting. 
Our analytic investigation of the algorithms pinpoints the reasons for the poor performance and targeted experiments confirm our analytic findings. Based on our investigation, we suggest directions for future research in the area.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2018:PSH, author = "Bailu Ding and Sudipto Das and Wentao Wu and Surajit Chaudhuri and Vivek Narasayya", title = "{Plan Stitch}: harnessing the best of many plans", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1123--1136", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query performance regression due to the query optimizer selecting a bad query execution plan is a major pain point in production workloads. Commercial DBMSs today can automatically detect and correct such query plan regressions by storing previously-executed plans and reverting to a previous plan which is still valid and has the least execution cost. Such reversion-based plan correction has relatively low risk of plan regression since the decision is based on observed execution costs. However, this approach ignores potentially valuable information of efficient subplans collected from other previously-executed plans. In this paper, we propose a novel technique, Plan Stitch, that automatically and opportunistically combines efficient subplans of previously-executed plans into a valid new plan, which can be cheaper than any individual previously-executed plan. We implement Plan Stitch on top of Microsoft SQL Server. 
Our experiments on TPC-DS benchmark and three real-world customer workloads show that plans obtained via Plan Stitch can reduce execution cost significantly, with a reduction of up to two orders of magnitude in execution cost when compared to reverting to the cheapest previously-executed plan.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2018:FES, author = "Sheng Wang and Tien Tuan Anh Dinh and Qian Lin and Zhongle Xie and Meihui Zhang and Qingchao Cai and Gang Chen and Beng Chin Ooi and Pingcheng Ruan", title = "{ForkBase}: an efficient storage engine for blockchain and forkable applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1137--1150", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing data storage systems offer a wide range of functionalities to accommodate an equally diverse range of applications. However, new classes of applications have emerged, e.g., blockchain and collaborative analytics, featuring data versioning, fork semantics, tamper-evidence or any combination thereof. They present new opportunities for storage systems to efficiently support such applications by embedding the above requirements into the storage. In this paper, we present ForkBase, a storage engine designed for blockchain and forkable applications. By integrating core application properties into the storage, ForkBase not only delivers high performance but also reduces development effort. The storage manages multiversion data and supports two variants of fork semantics which enable different fork workflows.
ForkBase is fast and space efficient, due to a novel index class that supports efficient queries as well as effective detection of duplicate content across data objects, branches and versions. We demonstrate ForkBase's performance using three applications: a blockchain platform, a wiki engine and a collaborative analytics application. We conduct extensive experimental evaluation against respective state-of-the-art solutions. The results show that ForkBase achieves superior performance while significantly lowering the development effort.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ammar:2018:EAD, author = "Khaled Ammar and M. Tamer {\"O}zsu", title = "Experimental analysis of distributed graph systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1151--1164", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231764", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper evaluates eight parallel graph processing systems: Hadoop, HaLoop, Vertica, Giraph, GraphLab (PowerGraph), Blogel, Flink Gelly, and GraphX (SPARK) over four very large datasets (Twitter, World Road Network, UK 200705, and ClueWeb) using four workloads (PageRank, WCC, SSSP and K-hop). The main objective is to perform an independent scale-out study by experimentally analyzing the performance, usability, and scalability (using up to 128 machines) of these systems. In addition to performance results, we discuss our experiences in using these systems and suggest some system tuning heuristics that lead to better performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{He:2018:TDE, author = "Yeye He and Xu Chu and Kris Ganjam and Yudian Zheng and Vivek Narasayya and Surajit Chaudhuri", title = "Transform-data-by-example {(TDE)}: an extensible search engine for data transformations", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1165--1177", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231766", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today, business analysts and data scientists increasingly need to clean, standardize and transform diverse data sets, such as name, address, date time, and phone number, before they can perform analysis. This process of data transformation is an important part of data preparation, and is known to be difficult and time-consuming for end-users. Traditionally, developers have dealt with these longstanding transformation problems using custom code libraries. They have built vast varieties of custom logic for name parsing and address standardization, etc., and shared their source code in places like GitHub. Data transformation would be a lot easier for end-users if they can discover and reuse such existing transformation logic. We developed Transform-Data-by-Example (TDE), which works like a search engine for data transformations. TDE ``indexes'' vast varieties of transformation logic in source code, DLLs, web services and mapping tables, so that users only need to provide a few input/output examples to demonstrate a desired transformation, and TDE can interactively find relevant functions to synthesize new programs consistent with all examples.
Using an index of 50K functions crawled from GitHub and Stackoverflow, TDE can already handle many common transformations not currently supported by existing systems. On a benchmark with over 200 transformation tasks, TDE generates correct transformations for 72\% tasks, which is considerably better than other systems evaluated. A beta version of TDE for Microsoft Excel is available via Office store. Part of the TDE technology also ships in Microsoft Power BI.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{OKeeffe:2018:FRE, author = "Dan O'Keeffe and Theodoros Salonidis and Peter Pietzuch", title = "{Frontier}: resilient edge processing for the {Internet of Things}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1178--1191", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231767", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In an edge deployment model, Internet-of-Things (IoT) applications, e.g. for building automation or video surveillance, must process data locally on IoT devices without relying on permanent connectivity to a cloud backend. The ability to harness the combined resources of multiple IoT devices for computation is influenced by the quality of wireless network connectivity. An open challenge is how practical edge-based IoT applications can be realised that are robust to changes in network bandwidth between IoT devices, due to interference and intermittent connectivity. We present Frontier, a distributed and resilient edge processing platform for IoT devices. 
The key idea is to express data-intensive IoT applications as continuous data-parallel streaming queries and to improve query throughput in an unreliable wireless network by exploiting network path diversity: a query includes operator replicas at different IoT nodes, which increases possible network paths for data. Frontier dynamically routes stream data to operator replicas based on network path conditions. Nodes probe path throughput and use backpressure stream routing to decide on transmission rates, while exploiting multiple operator replicas for data-parallelism. If a node loses network connectivity, a transient disconnection recovery mechanism reprocesses the lost data. Our experimental evaluation of Frontier shows that network path diversity improves throughput by $ 1.3 \times $--$ 2.8 \times $ for different IoT applications, while being resilient to intermittent network connectivity.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Haynes:2018:LDV, author = "Brandon Haynes and Amrita Mazumdar and Armin Alaghi and Magdalena Balazinska and Luis Ceze and Alvin Cheung", title = "{LightDB}: a {DBMS} for virtual reality video", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1192--1205", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231768", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present the data model, architecture, and evaluation of LightDB, a database management system designed to efficiently manage virtual, augmented, and mixed reality (VAMR) video content. 
VAMR video differs from its two-dimensional counterpart in that it is spherical with periodic angular dimensions, is nonuniformly and continuously sampled, and applications that consume such videos often have demanding latency and throughput requirements. To address these challenges, LightDB treats VAMR video data as a logically-continuous six-dimensional light field. Furthermore, LightDB supports a rich set of operations over light fields, and automatically transforms declarative queries into executable physical plans. We have implemented a prototype of LightDB and, through experiments with VAMR applications in the literature, we find that LightDB offers up to $ 4 \times $ throughput improvements compared with prior work.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{McKenna:2018:OEH, author = "Ryan McKenna and Gerome Miklau and Michael Hay and Ashwin Machanavajjhala", title = "Optimizing error of high-dimensional statistical queries under differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1206--1219", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231769", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differentially private algorithms for answering sets of predicate counting queries on a sensitive database have many applications. Organizations that collect individual-level data, such as statistical agencies and medical institutions, use them to safely release summary tabulations. However, existing techniques are accurate only on a narrow class of query workloads, or are extremely slow, especially when analyzing more than one or two dimensions of the data. 
In this work we propose HDMM, a new differentially private algorithm for answering a workload of predicate counting queries, that is especially effective for higher-dimensional datasets. HDMM represents query workloads using an implicit matrix representation and exploits this compact representation to efficiently search (a subset of) the space of differentially private algorithms for one that answers the input query workload with high accuracy. We empirically show that HDMM can efficiently answer queries with lower error than state-of-the-art techniques on a variety of low and high dimensional datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2018:MBM, author = "Yu Liu and Hantian Zhang and Luyuan Zeng and Wentao Wu and Ce Zhang", title = "{MLbench}: benchmarking machine learning services against human experts", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1220--1232", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231770", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern machine learning services and systems are complicated data systems --- the process of designing such systems is an art of compromising between functionality, performance, and quality. Providing different levels of system supports for different functionalities, such as automatic feature engineering, model selection and ensemble, and hyperparameter tuning, could improve the quality, but also introduce additional cost and system complexity. In this paper, we try to facilitate the process of asking the following type of questions: How much will the users lose if we remove the support of functionality x from a machine learning service? 
Answering this type of questions using existing datasets, such as the UCI datasets, is challenging. The main contribution of this work is a novel dataset, MLBench, harvested from Kaggle competitions. Unlike existing datasets, MLBench contains not only the raw features for a machine learning task, but also those used by the winning teams of Kaggle competitions. The winning features serve as a baseline of best human effort that enables multiple ways to measure the quality of machine learning services that cannot be supported by existing datasets, such as relative ranking on Kaggle and relative accuracy compared with best-effort systems. We then conduct an empirical study using MLBench to understand example machine learning services from Amazon and Microsoft Azure, and showcase how MLBench enables a comparative study revealing the strength and weakness of these existing machine learning services quantitatively and systematically. The full version of this paper can be found at {\tt arxiv.org/abs/1707.09562}.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2018:MCL, author = "Lu Chen and Chengfei Liu and Rui Zhou and Jianxin Li and Xiaochun Yang and Bin Wang", title = "Maximum co-located community search in large scale social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1233--1246", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231755", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of k-truss search has been well defined and investigated to find the highly correlated user groups in social networks.
But there is no previous study to consider the constraint of users' spatial information in k-truss search, denoted as co-located community search in this paper. The co-located community can serve many real applications. To search the maximum co-located communities efficiently, we first develop an efficient exact algorithm with several pruning techniques. After that, we further develop an approximation algorithm with adjustable accuracy guarantees and explore more effective pruning rules, which can reduce the computational cost significantly. To accelerate the real-time efficiency, we also devise a novel quadtree based index to support the efficient retrieval of users in a region and optimise the search regions with regards to the given query region. Finally, we verify the performance of our proposed algorithms and index using five real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zalipynis:2018:CDF, author = "Ramon Antonio Rodriges Zalipynis", title = "{ChronosDB}: distributed, file based, geospatial array {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1247--1261", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231754", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An array DBMS streamlines large N-d array management. A large portion of such arrays originates from the geospatial domain. The arrays often natively come as raster files while standalone command line tools are one of the most popular ways for processing these files. Decades of development and feedback resulted in numerous feature-rich, elaborate, free and quality-assured tools optimized mostly for a single machine. 
ChronosDB partially delegates in situ data processing to such tools and offers a formal N-d array data model to abstract from the files and the tools. ChronosDB readily provides a rich collection of array operations at scale and outperforms SciDB by up to $ 75 \times $ on average.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Macke:2018:ASR, author = "Stephen Macke and Yiming Zhang and Silu Huang and Aditya Parameswaran", title = "Adaptive sampling for rapidly matching histograms", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1262--1275", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231753", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In exploratory data analysis, analysts often have a need to identify histograms that possess a specific distribution, among a large class of candidate histograms, e.g., find countries whose income distribution is most similar to that of Greece. This distribution could be a new one that the user is curious about, or a known distribution from an existing histogram visualization. At present, this process of identification is brute-force, requiring the manual generation and evaluation of a large number of histograms. We present FastMatch: an end-to-end approach for interactively retrieving the histogram visualizations most similar to a user-specified target, from a large collection of histograms. The primary technical contribution underlying FastMatch is a probabilistic algorithm, HistSim, a theoretically sound sampling-based approach to identify the top-$k$ closest histograms under $ l_1 $ distance.
While HistSim can be used independently, within FastMatch we couple HistSim with a novel system architecture that is aware of practical considerations, employing asynchronous block-based sampling policies. FastMatch obtains near-perfect accuracy with up to $ 35 \times $ speedup over approaches that do not use sampling on several real-world datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Asudeh:2018:LSJ, author = "Abolfazl Asudeh and Azade Nazi and Jees Augustine and Saravanan Thirumuruganathan and Nan Zhang and Gautam Das and Divesh Srivastava", title = "Leveraging similarity joins for signal reconstruction", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1276--1288", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231752", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Signal reconstruction problem (SRP) is an important optimization problem where the objective is to identify a solution to an underdetermined system of linear equations that is closest to a given prior. It has a substantial number of applications in diverse areas including network traffic engineering, medical image reconstruction, acoustics, astronomy and many more. Most common approaches for SRP do not scale to large problem sizes. In this paper, we propose a dual formulation of this problem and show how adapting database techniques developed for scalable similarity joins provides a significant speedup. Extensive experiments on real-world and synthetic data show that our approach produces a significant speedup of up to 20x over competing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yu:2018:SHC, author = "Xiangyao Yu and Yu Xia and Andrew Pavlo and Daniel Sanchez and Larry Rudolph and Srinivas Devadas", title = "{Sundial}: harmonizing concurrency control and caching in a distributed {OLTP} database management system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1289--1302", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231763", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed transactions suffer from poor performance due to two major limiting factors. First, distributed transactions suffer from high latency because each of their accesses to remote data incurs a long network delay. Second, this high latency increases the likelihood of contention among distributed transactions, leading to high abort rates and low performance. We present Sundial, an in-memory distributed optimistic concurrency control protocol that addresses these two limitations. First, to reduce the transaction abort rate, Sundial dynamically determines the logical order among transactions at runtime, based on their data access patterns. Sundial achieves this by applying logical leases to each data element, which allows the database to dynamically calculate a transaction's logical commit timestamp. Second, to reduce the overhead of remote data accesses, Sundial allows the database to cache remote data in a server's local main memory and maintains cache coherence. With logical leases, Sundial integrates concurrency control and cache coherence into a simple unified protocol. We evaluate Sundial against state-of-the-art distributed concurrency control protocols. Sundial outperforms the next-best protocol by up to 57\% under high contention. 
Sundial's caching scheme improves performance by up to $ 4.6 \times $ in workloads with high access skew.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mai:2018:CSP, author = "Luo Mai and Kai Zeng and Rahul Potharaju and Le Xu and Steve Suh and Shivaram Venkataraman and Paolo Costa and Terry Kim and Saravanan Muthukrishnan and Vamsi Kuppa and Sudheer Dhulipalla and Sriram Rao", title = "{Chi}: a scalable and programmable control plane for distributed stream processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "10", pages = "1303--1316", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3231751.3231765", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 26 16:31:24 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream-processing workloads and modern shared cluster environments exhibit high variability and unpredictability. Combined with the large parameter space and the diverse set of user SLOs, this makes modern streaming systems very challenging to statically configure and tune. To address these issues, in this paper we investigate a novel control-plane design, Chi, which supports continuous monitoring and feedback, and enables dynamic re-configuration. Chi leverages the key insight of embedding control-plane messages in the data-plane channels to achieve a low-latency and flexible control plane for stream-processing systems. Chi introduces a new reactive programming model and design mechanisms to asynchronously execute control policies, thus avoiding global synchronization. We show how this allows us to easily implement a wide spectrum of control policies targeting different use cases observed in production. 
Large-scale experiments using production workloads from a popular cloud provider demonstrate the flexibility and efficiency of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mahajan:2018:RHA, author = "Divya Mahajan and Joon Kyung Kim and Jacob Sacks and Adel Ardalan and Arun Kumar and Hadi Esmaeilzadeh", title = "{In-RDBMS} hardware acceleration of advanced analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1317--1331", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236188", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The data revolution is fueled by advances in machine learning, databases, and hardware design. Programmable accelerators are making their way into each of these areas independently. As such, there is a void of solutions that enables hardware \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kolchinsky:2018:JQO, author = "Ilya Kolchinsky and Assaf Schuster", title = "Join query optimization techniques for complex event processing applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1332--1345", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236189", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Complex event processing (CEP) is a prominent technology used in many modern applications for monitoring and tracking events of interest in massive data streams. 
CEP engines inspect real-time information flows and attempt to detect combinations of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kolchinsky:2018:EAD, author = "Ilya Kolchinsky and Assaf Schuster", title = "Efficient adaptive detection of complex event patterns", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1346--1359", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236190", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Complex event processing (CEP) is widely employed to detect occurrences of predefined combinations (patterns) of events in massive data streams. As new events are accepted, they are matched using some type of evaluation structure, commonly optimized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wolf:2018:RMR, author = "Florian Wolf and Michael Brendle and Norman May and Paul R. Willems and Kai-Uwe Sattler and Michael Grossniklaus", title = "Robustness metrics for relational query execution plans", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1360--1372", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236191", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The quality of query execution plans in database systems determines how fast a query can be executed. 
It has been shown that conventional query optimization still selects sub-optimal or even bad execution plans, due to errors in the cardinality \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2018:QAK, author = "Weiguo Zheng and Jeffrey Xu Yu and Lei Zou and Hong Cheng", title = "Question answering over knowledge graphs: question understanding via template decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1373--1386", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236192", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The gap between unstructured natural language and structured data makes it challenging to build a system that supports using natural language to query large knowledge graphs. Many existing methods construct a structured query for the input question \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rammelaere:2018:ERD, author = "Joeri Rammelaere and Floris Geerts", title = "Explaining repaired data with {CFDs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1387--1399", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236193", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many popular data cleaning approaches are rule-based: Constraints are formulated in a logical framework, and data is considered dirty if constraints are violated. 
These constraints are often discovered from data, but to ascertain their validity, user \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dsilva:2018:AAA, author = "Joseph Vinish D'silva and Florestan {De Moor} and Bettina Kemme", title = "{AIDA}: abstraction for advanced in-database analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1400--1413", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236194", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the tremendous growth in data science and machine learning, it has become increasingly clear that traditional relational database management systems (RDBMS) are lacking appropriate support for the programming paradigms required by such applications, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agrawal:2018:REC, author = "Divy Agrawal and Sanjay Chawla and Bertty Contreras-Rojas and Ahmed Elmagarmid and Yasser Idris and Zoi Kaoudi and Sebastian Kruse and Ji Lucas and Essam Mansour and Mourad Ouzzani and Paolo Papotti and Jorge-Arnulfo Quian{\'e}-Ruiz and Nan Tang and Saravanan Thirumuruganathan and Anis Troudi", title = "{RHEEM}: enabling cross-platform data processing: may the big data be with you!", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1414--1427", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236195", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Solving business problems increasingly requires going beyond the limits of a single data processing platform (platform for short), such as Hadoop or a DBMS. As a result, organizations typically perform tedious and costly tasks to juggle their code and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2018:EET, author = "Peng Cheng and Xun Jian and Lei Chen", title = "An experimental evaluation of task assignment in spatial crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1428--1440", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3236196", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, with the rapid development of mobile devices and the crowdsourcing platforms, the spatial crowdsourcing has attracted much attention from the database community. 
Specifically, spatial crowdsourcing refers to sending a location-based request to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kumar:2018:EAE, author = "Rohit Kumar and Toon Calders", title = "{2SCENT}: an efficient algorithm for enumerating all simple temporal cycles", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "11", pages = "1441--1453", month = jul, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3236187.3269460", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:27 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In interaction networks nodes may interact continuously and repeatedly. Not only which nodes interact is important, but also the order in which interactions take place and the patterns they form. These patterns cannot be captured by solely inspecting \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Ebraheem:2018:DRT,
  author          = "Muhammad Ebraheem and Saravanan Thirumuruganathan and Shafiq Joty and Mourad Ouzzani and Nan Tang",
  title           = "Distributed representations of tuples for entity resolution",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1454--1467",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3269461",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Despite the efforts in 70+ years in all aspects of entity resolution (ER), there is still a high demand for democratizing ER --- by reducing the heavy human involvement in labeling data, performing feature engineering, tuning parameters, and defining \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Hasani:2018:ECA,
  author          = "Sona Hasani and Saravanan Thirumuruganathan and Abolfazl Asudeh and Nick Koudas and Gautam Das",
  title           = "Efficient construction of approximate ad-hoc {ML} models through materialization and reuse",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1468--1481",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3269462",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Machine learning has become an essential toolkit for complex analytic processing. Data is typically stored in large data warehouses with multiple dimension hierarchies. Often, data used for building an ML model are aligned on OLAP hierarchies such as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Chu:2018:AFA,
  author          = "Shumo Chu and Brendan Murphy and Jared Roesch and Alvin Cheung and Dan Suciu",
  title           = "Axiomatic foundations and algorithms for deciding semantic equivalences of {SQL} queries",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1482--1495",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236200",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Deciding the equivalence of SQL queries is a fundamental problem in data management. As prior work has mainly focused on studying the theoretical limitations of the problem, very few implementations for checking such equivalences exist. In this paper, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Almutairi:2018:HSS,
  author          = "Faisal M. Almutairi and Fan Yang and Hyun Ah Song and Christos Faloutsos and Nicholas Sidiropoulos and Vladimir Zadorozhny",
  title           = "{Homerun}: scalable sparse-spectrum reconstruction of aggregated historical data",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1496--1508",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236201",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Recovering a time sequence of events from multiple aggregated and possibly overlapping reports is a major challenge in historical data fusion. The goal is to reconstruct a higher resolution event sequence from a mixture of lower resolution samples as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Kuo:2018:DPH,
  author          = "Yu-Hsuan Kuo and Cho-Chun Chiu and Daniel Kifer and Michael Hay and Ashwin Machanavajjhala",
  title           = "Differentially private hierarchical count-of-counts histograms",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1509--1521",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236202",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We consider the problem of privately releasing a class of queries that we call hierarchical count-of-counts histograms. Count-of-counts histograms partition the rows of an input table into groups (e.g., group of people in the same household), and for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2018:EDA,
  author          = "Feng Zhang and Jidong Zhai and Xipeng Shen and Onur Mutlu and Wenguang Chen",
  title           = "Efficient document analytics on compressed data: method, challenges, algorithms, insights",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1522--1535",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236203",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Today's rapidly growing document volumes pose pressing challenges to modern document analytics, in both space usage and processing time. In this work, we propose the concept of compression-based direct processing to alleviate issues in both dimensions. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Muller:2018:YSW,
  author          = "Tobias M{\"u}ller and Benjamin Dietrich and Torsten Grust",
  title           = "You say `what', {I} hear `where' and `why': (mis-)interpreting {SQL} to derive fine-grained provenance",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1536--1549",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236204",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "SQL declaratively specifies what the desired output of a query is. This work shows that a non-standard interpretation of the SQL semantics can, instead, disclose where a piece of the output originated in the input and why that piece found its way into \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Schulz:2018:EDS,
  author          = "Lars-Christian Schulz and David Broneske and Gunter Saake",
  title           = "An eight-dimensional systematic evaluation of optimized search algorithms on modern processors",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1550--1562",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236205",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Searching in sorted arrays of keys is a common task with a broad range of applications. Often searching is part of the performance critical sections of a database query or index access, raising the question what kind of search algorithm to choose and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Trummer:2018:VLT,
  author          = "Immanuel Trummer and Mark Bryan and Ramya Narasimha",
  title           = "Vocalizing large time series efficiently",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1563--1575",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236206",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We vocalize query results for time series data. We describe a holistic approach that integrates query evaluation and vocalization. In particular, we generate only those parts of the query result that are relevant for voice output. We exploit the fact \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Palkar:2018:FBY,
  author          = "Shoumik Palkar and Firas Abuzaid and Peter Bailis and Matei Zaharia",
  title           = "Filter before you parse: faster analytics on raw data with {Sparser}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1576--1589",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236207",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Exploratory big data applications often run on raw unstructured or semi-structured data formats, such as JSON files or text logs. These applications can spend 80--90\% of their execution time parsing the data. In this paper, we propose a new approach for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Abbas:2018:SGP,
  author          = "Zainab Abbas and Vasiliki Kalavri and Paris Carbone and Vladimir Vlassov",
  title           = "Streaming graph partitioning: an experimental study",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1590--1603",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236208",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Graph partitioning is an essential yet challenging task for massive graph analysis in distributed computing. Common graph partitioning methods scan the complete graph to obtain structural characteristics offline, before partitioning. However, the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Cai:2018:EDM,
  author          = "Qingchao Cai and Wentian Guo and Hao Zhang and Divyakant Agrawal and Gang Chen and Beng Chin Ooi and Kian-Lee Tan and Yong Meng Teo and Sheng Wang",
  title           = "Efficient distributed memory management with {RDMA} and caching",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1604--1617",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236209",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Recent advancements in high-performance networking interconnect significantly narrow the performance gap between intra-node and inter-node communications, and open up opportunities for distributed memory platforms to enforce cache coherency among \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Didona:2018:CCL,
  author          = "Diego Didona and Rachid Guerraoui and Jingjing Wang and Willy Zwaenepoel",
  title           = "Causal consistency and latency optimality: friend or foe?",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1618--1632",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236210",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Causal consistency is an attractive consistency model for geo-replicated data stores. It is provably the strongest model that tolerates network partitions. It avoids the long latencies associated with strong consistency, and, especially when using read- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Tong:2018:UAR,
  author          = "Yongxin Tong and Yuxiang Zeng and Zimu Zhou and Lei Chen and Jieping Ye and Ke Xu",
  title           = "A unified approach to route planning for shared mobility",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1633--1646",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236211",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "There has been a dramatic growth of shared mobility applications such as ride-sharing, food delivery and crowdsourced parcel delivery. Shared mobility refers to transportation services that are shared among users, where a central issue is route planning. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Gan:2018:MBQ,
  author          = "Edward Gan and Jialin Ding and Kai Sheng Tai and Vatsal Sharan and Peter Bailis",
  title           = "Moment-based quantile sketches for efficient high cardinality aggregation queries",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1647--1660",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236212",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Interactive analytics increasingly involves querying for quantiles over sub-populations of high cardinality datasets. Data processing engines such as Druid and Spark use mergeable summaries to estimate quantiles, but summary merge times can be a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Pandey:2018:HGM,
  author          = "Varun Pandey and Andreas Kipf and Thomas Neumann and Alfons Kemper",
  title           = "How good are modern spatial analytics systems?",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1661--1673",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236213",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Spatial data is pervasive. Large amount of spatial data is produced every day from GPS-enabled devices such as cell phones, cars, sensors, and various consumer based applications such as Uber, location-tagged posts in Facebook, Instagram, Snapchat, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Rong:2018:LSH,
  author          = "Kexin Rong and Clara E. Yoon and Karianne J. Bergen and Hashem Elezabi and Peter Bailis and Philip Levis and Gregory C. Beroza",
  title           = "Locality-sensitive hashing for earthquake detection: a case study of scaling data-driven science",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1674--1687",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236214",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "In this work, we report on a novel application of Locality Sensitive Hashing (LSH) to seismic data at scale. Based on the high waveform similarity between reoccurring earthquakes, our application identifies potential earthquakes by searching for similar \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Jensen:2018:MMM,
  author          = "S{\o}ren Kejser Jensen and Torben Bach Pedersen and Christian Thomsen",
  title           = "{ModelarDB}: modular model-based time series management with {Spark} and {Cassandra}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1688--1701",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236215",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Industrial systems, e.g., wind turbines, generate big amounts of data from reliable sensors with high velocity. As it is unfeasible to store and query such big amounts of data, only simple aggregates are currently stored. However, aggregates remove \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Jonathan:2018:ECA,
  author          = "Christopher Jonathan and Umar Farooq Minhas and James Hunter and Justin Levandoski and Gor Nishanov",
  title           = "Exploiting coroutines to attack the ``killer nanoseconds''",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1702--1714",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236216",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Database systems use many pointer-based data structures, including hash tables and B+-trees, which require extensive ``pointer-chasing.'' Each pointer dereference, e.g., during a hash probe or a B+-tree traversal, can result in a CPU cache miss, stalling \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Bindschaedler:2018:TIP,
  author          = "Vincent Bindschaedler and Paul Grubbs and David Cash and Thomas Ristenpart and Vitaly Shmatikov",
  title           = "The tao of inference in privacy-protected databases",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1715--1728",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236217",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "To protect database confidentiality even in the face of full compromise while supporting standard functionality, recent academic proposals and commercial products rely on a mix of encryption schemes. The recommendation is to apply strong, semantically \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Demertzis:2018:ESE,
  author          = "Ioannis Demertzis and Rajdeep Talapatra and Charalampos Papamanthou",
  title           = "Efficient searchable encryption through compression",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "11",
  pages           = "1729--1741",
  month           = jul,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3236187.3236218",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:27 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "In this work we design new searchable encryption schemes whose goal is to minimize the number of cryptographic operations required to retrieve the result---a dimension mostly overlooked by previous works, yet very important in practice. Our main idea is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2018:CEB,
  author          = "Shen Li and Paul Gerver and John MacMillan and Daniel Debrunner and William Marshall and Kun-Lung Wu",
  title           = "Challenges and experiences in building an efficient {Apache Beam} runner for {IBM Streams}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1742--1754",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229864",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "This paper describes the challenges and experiences in the development of IBM Streams runner for Apache Beam. Apache Beam is emerging as a common stream programming interface for multiple computing engines. Each participating engine implements a runner. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Boehm:2018:OOF,
  author          = "Matthias Boehm and Berthold Reinwald and Dylan Hutchison and Prithviraj Sen and Alexandre V. Evfimievski and Niketan Pansare",
  title           = "On optimizing operator fusion plans for large-scale machine learning in {SystemML}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1755--1768",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229865",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Many machine learning (ML) systems allow the specification of ML algorithms by means of linear algebra programs, and automatically generate efficient execution plans. The opportunities for fused operators---in terms of fused chains of basic operators--- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Rehrmann:2018:OCS,
  author          = "Robin Rehrmann and Carsten Binnig and Alexander B{\"o}hm and Kihong Kim and Wolfgang Lehner and Amr Rizk",
  title           = "{OLTPshare}: the case for sharing in {OLTP} workloads",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1769--1780",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229866",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "In the past, resource sharing has been extensively studied for OLAP workloads. Naturally, the question arises, why studies mainly focus on OLAP and not on OLTP workloads? At first sight, OLTP queries --- due to their short runtime --- may not have enough \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Schelter:2018:ALS,
  author          = "Sebastian Schelter and Dustin Lange and Philipp Schmidt and Meltem Celikel and Felix Biessmann and Andreas Grafberger",
  title           = "Automating large-scale data quality verification",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1781--1794",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229867",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Modern companies and institutions rely on data to guide every single business process and decision. Missing or incorrect information seriously compromises any decision process downstream. Therefore, a crucial, but tedious task for everyone involved in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Shacham:2018:TOC,
  author          = "Ohad Shacham and Yonatan Gottesman and Aran Bergman and Edward Bortnikov and Eshcar Hillel and Idit Keidar",
  title           = "Taking {Omid} to the clouds: fast, scalable transactions for real-time cloud analytics",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1795--1808",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229868",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We describe how we evolve Omid, a transaction processing system for Apache HBase, to power Apache Phoenix, a cloud-grade real-time SQL analytics engine. Omid was originally designed for data processing pipelines at Yahoo, which are, by and large, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Jacques-Silva:2018:PSJ,
  author          = "Gabriela Jacques-Silva and Ran Lei and Luwei Cheng and Guoqiang Jerry Chen and Kuen Ching and Tanji Hu and Yuan Mei and Kevin Wilfong and Rithin Shetty and Serhat Yilmaz and Anirban Banerjee and Benjamin Heintz and Shridar Iyer and Anshul Jaiswal",
  title           = "Providing streaming joins as a service at {Facebook}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1809--1821",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229869",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Stream processing applications reduce the latency of batch data pipelines and enable engineers to quickly identify production issues. Many times, a service can log data to distinct streams, even if they relate to the same real-world event (e.g., a \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Cai:2018:FIL,
  author          = "Le Cai and Jianjun Chen and Jun Chen and Yu Chen and Kuorong Chiang and Marko Dimitrijevic and Yonghua Ding and Yu Dong and Ahmad Ghazal and Jacques Hebert and Kamini Jagtiani and Suzhen Lin and Ye Liu and Demai Ni and Chunfeng Pei and Jason Sun and Yongyan Wang and Li Zhang and Mingyi Zhang and Cheng Zhu",
  title           = "{FusionInsight LibrA}: {Huawei}'s enterprise cloud data analytics platform",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1822--1834",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229870",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Huawei FusionInsight LibrA (FI-MPPDB) is a petabyte scale enterprise analytics platform developed by the Huawei database group. It started as a prototype more than five years ago, and is now being used by many enterprise customers over the globe, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Samwel:2018:FQD,
  author          = "Bart Samwel and John Cieslewicz and Ben Handy and Jason Govig and Petros Venetis and Chanjun Yang and Keith Peters and Jeff Shute and Daniel Tenedorio and Himani Apte and Felix Weigel and David Wilhite and Jiacheng Yang and Jun Xu and Jiexing Li and Zhan Yuan and Craig Chasseur and Qiang Zeng and Ian Rae and Anurag Biyani and Andrew Harn and Yang Xia and Andrey Gubichev and Amr El-Helw and Orri Erling and Zhepeng Yan and Mohan Yang and Yiqun Wei and Thanh Do and Colin Zheng and Goetz Graefe and Somayeh Sardashti and Ahmed M. 
Aly and Divy Agrawal and Ashish Gupta and Shiv Venkataraman",
  title           = "{F1 Query}: declarative querying at scale",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1835--1848",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229871",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "F1 Query is a stand-alone, federated query processing platform that executes SQL queries against data stored in different file-based formats as well as different storage systems at Google (e.g., Bigtable, Spanner, Google Spreadsheets, etc.). F1 Query \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Cao:2018:PUL,
  author          = "Wei Cao and Zhenjun Liu and Peng Wang and Sen Chen and Caifeng Zhu and Song Zheng and Yuhui Wang and Guoqing Ma",
  title           = "{PolarFS}: an ultra-low latency and failure resilient distributed file system for shared storage cloud database",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1849--1862",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229872",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "PolarFS is a distributed file system with ultra-low latency and high availability, designed for the POLARDB database service, which is now available on the Alibaba Cloud. PolarFS utilizes a lightweight network stack and I/O stack in user-space, taking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Bortnikov:2018:ABM,
  author          = "Edward Bortnikov and Anastasia Braginsky and Eshcar Hillel and Idit Keidar and Gali Sheffi",
  title           = "{Accordion}: better memory organization for {LSM} key--value stores",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1863--1875",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229873",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Log-structured merge (LSM) stores have emerged as the technology of choice for building scalable write-intensive key--value storage systems. An LSM store replaces random I/O with sequential I/O by accumulating large batches of writes in a memory store. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Qiu:2018:RTC,
  author          = "Xiafei Qiu and Wubin Cen and Zhengping Qian and You Peng and Ying Zhang and Xuemin Lin and Jingren Zhou",
  title           = "Real-time constrained cycle detection in large dynamic graphs",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1876--1888",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229874",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "As graph data is prevalent for an increasing number of Internet applications, continuously monitoring structural patterns in dynamic graphs in order to generate real-time alerts and trigger prompt actions becomes critical for many applications. In this \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{Gurajada:2018:BHM,
  author          = "Aditya Gurajada and Dheren Gala and Fei Zhou and Amit Pathak and Zhan-Feng Ma",
  title           = "{BTrim}: hybrid in-memory database architecture for extreme transaction processing in {VLDBs}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1889--1901",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3229875",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "To address the need for extreme OLTP performance on commodity multi-core hardware supporting large amounts of memory, SAP ASE is re-architected to tightly integrate an In-Memory Row Store (IMRS) within the existing database engine. The IMRS is both a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "https://dl.acm.org/loi/pvldb",
}

@Article{S:2018:SSI,
  author          = "Avinesh P. V. S. and Benjamin H{\"a}ttasch and Orkan {\"O}zyurt and Carsten Binnig and Christian M. Meyer",
  title           = "{Sherlock}: a system for interactive summarization of large text collections",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "11",
  number          = "12",
  pages           = "1902--1905",
  month           = aug,
  year            = "2018",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3229863.3236220",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Thu Oct 2 16:52:29 MDT 2025",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "There exists an ever-growing set of data-centric systems that allow data scientists of varying skill levels to interactively manipulate, analyze and explore large structured data sets. However, there are currently not many systems that allow data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Behrens:2018:DFD, author = "Hans Walter Behrens and K. Sel{\c{c}}uk Candan and Xilun Chen and Ashish Gadkari and Yash Garg and Mao-Lin Li and Xinsheng Li and Sicong Liu and Nicholas Martinez and Jiayong Mo and Elliot Nester and Silvestro Poccia and Manjusha Ravindranath and Maria Luisa Sapino", title = "{Datastorm-FE}: a data- and decision-flow and coordination engine for coupled simulation ensembles", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1906--1909", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236221", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data- and model-driven computer simulations are increasingly critical in many application domains. Yet, several critical data challenges remain in obtaining and leveraging simulations in decision making. Simulations may track 100s of parameters, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2018:DOA, author = "Bohan Zhang and Dana Van Aken and Justin Wang and Tao Dai and Shuli Jiang and Jacky Lao and Siyuan Sheng and Andrew Pavlo and Geoffrey J. 
Gordon", title = "A demonstration of the {OtterTune} automatic database management system tuning service", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1910--1913", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database management systems (DBMSs) have a plethora of tunable knobs that control almost everything in the system. The performance of a DBMS is highly dependent on these configuration knobs, however, getting this tuning right is hard. Many organizations \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kakoulli:2018:OAT, author = "Elena Kakoulli and Nikolaos D. Karmiris and Herodotos Herodotou", title = "{OctopusFS} in action: tiered storage management for data intensive computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1914--1917", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236223", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The continuous improvements in memory, storage devices, and network technologies of commodity hardware introduce new challenges and opportunities in tiered storage management. Whereas past work is exploiting storage tiers in pairs or for specific \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2018:TST, author = "Huan Li and Hua Lu and Feichao Shi and Gang Chen and Ke Chen and Lidan Shou", title = "{TRIPS}: a system for translating raw indoor positioning data into visual mobility semantics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1918--1921", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236224", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rapid accumulation of indoor positioning data is increasingly booming the interest in indoor mobility analyses. As a fundamental analysis, it is highly relevant to translate raw indoor positioning data into mobility semantics that describe what, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ke:2018:DPP, author = "Xiangyu Ke and Michelle Teo and Arijit Khan and Vijaya Krishna Yalavarthi", title = "A demonstration of {PERC}: probabilistic entity resolution with crowd errors", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1922--1925", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236225", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper demonstrates PERC --- our system for crowdsourced entity resolution with human errors. Entity Resolution (ER) is a critical step in data cleaning and analytics. Although many machine-based methods existed for ER task, crowdsourcing is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2018:CCP, author = "Guoliang Li and Chengliang Chai and Ju Fan and Xueping Weng and Jian Li and Yudian Zheng and Yuanbing Li and Xiang Yu and Xiaohang Zhang and Haitao Yuan", title = "{CDB}: a crowd-powered database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1926--1929", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236226", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Crowd-powered database systems can leverage the crowd's ability to address machine-hard problems, e.g., data integration. Existing crowdsourcing systems adopt the traditional tree model to select a good query plan. However, the tree model can optimize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chandramouli:2018:FEC, author = "Badrish Chandramouli and Guna Prasaad and Donald Kossmann and Justin Levandoski and James Hunter and Mike Barnett", title = "{FASTER}: an embedded concurrent key--value store for state management", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1930--1933", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236227", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the last decade, there has been a tremendous growth in data-intensive applications and services in the cloud. Data is created on a variety of edge sources such as devices, and is processed by cloud applications to gain insights or make decisions. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2018:MSD, author = "Gensheng Zhang and Chengkai Li", title = "{Maverick}: a system for discovering exceptional facts from knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1934--1937", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236228", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents Maverick, a system for discovering exceptional facts about entities in knowledge graphs. Maverick is built upon a beam-search based algorithmic framework which we proposed in a research paper that is published in SIGMOD 2018. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2018:PPT, author = "Lu Chen and Yunjun Gao and Zixian Liu and Xiaokui Xiao and Christian S. Jensen and Yifan Zhu", title = "{PTrider}: a price-and-time-aware ridesharing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1938--1941", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236229", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ridesharing is popular among travellers because it can reduce their travel costs, and it also holds the potential to reduce travel time, congestion, air pollution, and overall fuel consumption. Existing ridesharing systems (e.g., lyft, uberPOOL) often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Beheshti:2018:CKL, author = "Amin Beheshti and Boualem Benatallah and Reza Nouri and Alireza Tabebordbar", title = "{CoreKG}: a knowledge lake service", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1942--1945", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236230", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With Data Science continuing to emerge as a powerful differentiator across industries, organisations are now focused on transforming their data into actionable insights. This task is challenging as in today's knowledge-, service-, and cloud-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ortona:2018:RRD, author = "Stefano Ortona and Venkata Vamsikrishna Meduri and Paolo Papotti", title = "{RuDiK}: rule discovery in knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1946--1949", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236231", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RuDiK is a system for the discovery of declarative rules over knowledge-bases (KBs). RuDiK discovers both positive rules, which identify relationships between entities, e.g., ``if two persons have the same parent, they are siblings'', and negative rules, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Papadakis:2018:RJE, author = "George Papadakis and Leonidas Tsekouras and Emmanouil Thanos and George Giannakopoulos and Themis Palpanas and Manolis Koubarakis", title = "The return of {JedAI}: end-to-end entity resolution for structured and semi-structured data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1950--1953", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236232", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "JedAI is an Entity Resolution toolkit that can be used in three ways: (i) as an open-source library that combines state-of-the-art methods into a plethora of end-to-end workflows, (ii) as a user-friendly desktop application with a wizardlike interface \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2018:PSA, author = "Seokki Lee and Bertram Lud{\"a}scher and Boris Glavic", title = "Provenance summaries for answers and non-answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1954--1957", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236233", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Explaining why an answer is (not) in the result of a query has proven to be of immense importance for many applications. However, why-not provenance, and to a lesser degree also why-provenance, can be very large, even for small input datasets. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xin:2018:HEA, author = "Doris Xin and Litian Ma and Jialin Liu and Stephen Macke and Shuchen Song and Aditya Parameswaran", title = "{Helix}: accelerating human-in-the-loop machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1958--1961", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236234", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data application developers and data scientists spend an inordinate amount of time iterating on machine learning (ML) workflows---by modifying the data pre-processing, model training, and postprocessing steps---via trial-and-error to achieve the desired \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Siddiqui:2018:SFP, author = "Tarique Siddiqui and Paul Luh and Zesheng Wang and Karrie Karahalios and Aditya Parameswaran", title = "{Shapesearch}: flexible pattern-based querying of trend line visualizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1962--1965", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236235", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding visualizations with desired patterns is a common goal during data exploration. However, due to the limited expressiveness and flexibility of existing visual analytics systems, pattern-based querying of visualizations has largely been a manual \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2018:PSP, author = "Miao Xie and Sourav S. Bhowmick and Hao Su and Gao Cong and Wook-Shin Han", title = "{PANDA}: a system for partial topology-based search on large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1966--1969", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236236", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A large body of research on subgraph query processing on large networks assumes that a query is posed in the form of a connected graph. Unfortunately, end users in practice may not always have precise knowledge about the topological relationships \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2018:MPT, author = "Wei Lu and Xinyi Zhang and Zhiyu Shui and Zhe Peng and Xiao Zhang and Xiaoyong Du and Hao Huang and Xiaoyu Wang and Anqun Pan and Haixiang Li", title = "{MSQL+}: a plugin toolkit for similarity search under metric spaces in distributed relational database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1970--1973", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236237", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity search is a primitive operation in various database applications. Thus far, a large number of access methods have been proposed to accelerate the similarity query processing. Nonetheless, these methods mostly focus on developing standalone \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sanghi:2018:HDB, author = "Anupam Sanghi and Raghav Sood and Dharmendra Singh and Jayant R. Haritsa and Srikanta Tirthapura", title = "{HYDRA}: a dynamic big data regenerator", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1974--1977", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236238", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A core requirement of database engine testing is the ability to create synthetic versions of the customer's data warehouse at the vendor site. Prior work on synthetic data regeneration suffers from critical limitations with regard to (a) scaling to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jamour:2018:DMM, author = "Fuad Jamour and Ibrahim Abdelaziz and Panos Kalnis", title = "A demonstration of {MAGiQ}: matrix algebra approach for solving {RDF} graph queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1978--1981", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236239", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing RDF engines follow one of two design paradigms: relational or graph-based. Such engines are typically designed for specific hardware architectures, mainly CPUs, and are not easily portable to new architectures. Porting an existing engine to a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tan:2018:RRE, author = "Wei Chit Tan and Meihui Zhang and Hazem Elmeleegy and Divesh Srivastava", title = "{REGAL+}: reverse engineering {SPJA} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1982--1985", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236240", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The goal of query reverse engineering is to re-generate the SQL query that produced a given result from some known database. The problem has many real world applications where users need to better understand the lineage and trustworthiness of various \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deutch:2018:NNL, author = "Daniel Deutch and Nave Frost and Amir Gilad and Tomer Haimovich", title = "{NLproveNAns}: natural language provenance for non-answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1986--1989", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236241", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Natural language (NL) interfaces to databases allow users without technical background to query the database and get the results. Users of such systems may be surprised by the absence of certain expected results. To this end, we propose to demonstrate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2018:FTD, author = "Chen Xu and Rudi Poepsel Lemaitre and Juan Soto and Volker Markl", title = "Fault-tolerance for distributed iterative dataflows in action", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1990--1993", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236242", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed dataflow systems (DDS) are widely employed in graph processing and machine learning (ML), where many of these algorithms are iterative in nature. Typically, DDS achieve fault-tolerance using checkpointing mechanisms or they exploit \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abramovitz:2018:QQS, author = "Efrat Abramovitz and Daniel Deutch and Amir Gilad", title = "{QuestPro}: queries in {SPARQL} through provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1994--1997", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236243", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose to demonstrate QuestPro, a prototype interactive system aimed at allowing non-expert users to specify SPARQL queries. Notably, QuestPro makes an extensive use of provenance in deriving the SPARQL queries, in two ways. First, we ask users to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jarovsky:2018:GRS, author = "Ariel Jarovsky and Tova Milo and Slava Novgorodov and Wang-Chiew Tan", title = "{GOLDRUSH}: rule sharing system for fraud detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "1998--2001", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236244", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fraud detection rules, written by domain experts, are often employed by financial companies to enhance their machine learning-based mechanisms for accurate detection of fraudulent transactions. Accurate rule writing is a challenging task where domain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Aebeloe:2018:DDP, author = "Christian Aebeloe and Gabriela Montoya and Vinay Setty and Katja Hose", title = "Discovering diversified paths in knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2002--2005", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236245", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vast amounts of world knowledge is now accessible through Knowledge Graphs (KGs) in RDF format and can be queried using SPARQL. Yet, finding paths between nodes in such graphs is not part of the official SPARQL 1.1 standard; only the simpler \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Junghanns:2018:DDG, author = "Martin Junghanns and Max Kie{\ss}ling and Niklas Teichmann and Kevin G{\'o}mez and Andr{\'e} Petermann and Erhard Rahm", title = "Declarative and distributed graph analytics with {GRADOOP}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2006--2009", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236246", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate Gradoop, an open source framework that combines and extends features of graph database systems with the benefits of distributed graph processing. Using a rich graph data model and powerful graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2018:CFT, author = "J. W. Zhang and Yu Wang and Y. C. Tay", title = "A collaborative framework for tweaking properties in a synthetic dataset", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2010--2013", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236247", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Researchers and developers use benchmarks to compare their algorithms and products. For database systems, a benchmark must have a dataset D. To be application-specific, this dataset D should be empirical. However, a real D may be too small, or too large,\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jammi:2018:TFI, author = "Manasa Jammi and Jaydeep Sen and Ashish Mittal and Sagar Verma and Vardaan Pahuja and Rema Ananthanarayanan and Pranay Lohia and Hima Karanam and Diptikalyan Saha and Karthik Sankaranarayanan", title = "Tooling framework for instantiating natural language querying system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2014--2017", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236248", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent times have seen a growing demand for natural language querying (NLQ) interfaces to retrieve information from the structured data sources such as knowledge bases. Using this interface, business users can directly interact with a database without \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2018:KSS, author = "Xiaolan Wang and Jiyu Komiya and Yoshihiko Suhara and Aaron Feng and Behzad Golshan and Alon Halevy and Wang-Chiew Tan", title = "{Koko}: a system for scalable semantic querying of text", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2018--2021", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236249", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Koko is a declarative information extraction system that incorporates advances in natural language processing techniques in its extraction language.
Koko's extraction language supports simultaneous specification of conditions \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2018:GGC, author = "Jing Wang and Zichen Liu and Shuai Ma and Nikos Ntarmos and Peter Triantafillou", title = "{GC}: a graph caching system for subgraph\slash supergraph queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2022--2025", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236250", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a graph caching system GC for expediting subgraph/supergraph queries, which are computationally expensive due to the entailed NP-Complete subgraph isomorphism problem. Unlike existing caching systems for fast data access where each cache \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lissandrini:2018:XYP, author = "Matteo Lissandrini and Davide Mottin and Yannis Velegrakis and Themis Palpanas", title = "{X$^2$Q}: your personal example-based graph explorer", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2026--2029", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236251", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Exploring knowledge graphs can be a daunting task for any user, expert or novice.
This is due to the complexity of the schema or because they are unfamiliar with the contents of the data, or even because they do not know precisely what they are looking \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chanial:2018:CFC, author = "Camille Chanial and R{\'e}douane Dziri and Helena Galhardas and Julien Leblay and Minh-Huong Le Nguyen and Ioana Manolescu", title = "{Connectionlens}: finding connections across heterogeneous data sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2030--2033", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236252", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays, journalism is facilitated by the existence of large amounts of publicly available digital data sources. In particular, journalists can do investigative work, which typically consists on keyword-based searches over many heterogeneous, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Senellart:2018:PPP, author = "Pierre Senellart and Louis Jachiet and Silviu Maniu and Yann Ramusat", title = "{ProvSQL}: provenance and probability management in {PostgreSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2034--2037", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236253", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demonstration showcases ProvSQL, an open-source module for the PostgreSQL database management system that adds support for computation of provenance and probabilities of query results. A large range of provenance formalisms are supported, including \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shang:2018:CDC, author = "Zechao Shang and Will Brackenbury and Aaron J. Elmore and Michael J. Franklin", title = "{CYADB}: a database that covers your ask", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2038--2041", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236254", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data completeness is becoming a significant roadblock in data quality. Existing research in this area currently handles the certainty of a query by ignoring the incomplete part and approximating missing attributes on partially complete tuples, but \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Govind:2018:CHC, author = "Yash Govind and Erik Paulson and Palaniappan Nagarajan and Paul Suganthan G. C. and AnHai Doan and Youngchoon Park and Glenn M. Fung and Devin Conathan and Marshall Carter and Mingju Sun", title = "{CloudMatcher}: a hands-off cloud\slash crowd service for entity matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2042--2045", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236255", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data science applications proliferate, more and more lay users must perform data integration (DI) tasks, which used to be done by sophisticated CS developers. Thus, it is increasingly critical that we develop hands-off DI services, which lay users \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Grulich:2018:CEC, author = "Philipp M. Grulich and Faisal Nawab", title = "Collaborative edge and cloud neural networks for real-time video processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2046--2049", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236256", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The efficient processing of video streams is a key component in many emerging Internet of Things (IoT) and edge applications, such as Virtual and Augmented Reality (V/AR) and self-driving cars. These applications require real-time high-throughput video \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agrawal:2018:DAA, author = "Ashvin Agrawal and Avrilia Floratou", title = "{Dhalion} in action: automatic management of streaming applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2050--2053", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236257", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In a world where organizations are being inundated with data from various sources, analyzing data and gaining actionable insights in real-time has become a key service differentiator. Over the last few years, several stream processing frameworks have \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karlas:2018:EMA, author = "Bojan Karla{\v{s}} and Ji Liu and Wentao Wu and Ce Zhang", title = "{Ease.ml} in action: towards multi-tenant declarative learning services", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2054--2057", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236258", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate ease.ml, a multi-tenant machine learning service we host at ETH Zurich for various research groups. Unlike existing machine learning services, ease.ml presents a novel architecture that supports multi-tenant, cost-aware model selection \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Neto:2018:MMC, author = "Antonio Cavalcante Araujo Neto and Mario A. Nascimento and Joerg Sander and Ricardo J. G. B. Campello", title = "{MustaCHE}: a multiple clustering hierarchies explorer", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2058--2061", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236259", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration paper we introduce MustaCHE (Multiple Clustering Hierarchies Explorer), a tool that allows analysis and exploration of multiple clustering hierarchies in an interactive and visual manner. A known issue in the context of density-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Salimi:2018:HDD, author = "Babak Salimi and Corey Cole and Peter Li and Johannes Gehrke and Dan Suciu", title = "{HypDB}: a demonstration of detecting, explaining and resolving bias in {OLAP} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2062--2065", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236260", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "On line analytical processing (OLAP) is an essential element of decision-support systems. However, OLAP queries can be biased and lead to perplexing and incorrect insights. In this demo, we present HypDB, the first system to detect, explain and resolve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Picado:2018:LEH, author = "Jose Picado and Arash Termehchy and Sudhanshu Pathak", title = "Learning efficiently over heterogeneous databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2066--2069", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236261", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a relational database and training examples for a target relation, relational learning algorithms learn a Datalog program that defines the target relation in terms of the existing relations in the database. We demonstrate CastorX, a relational \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{dosSantos:2018:SED, author = "Walter dos Santos and Gustavo P. Avelar and Manoel Horta Ribeiro and Dorgival Guedes and Wagner Meira", title = "Scalable and efficient data analytics and mining with {Lemonade}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2070--2073", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236262", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Professionals outside of the area of Computer Science have an increasing need to analyze large bodies of data. This analysis often demands high level of security and has to be done in the cloud. However, current data analysis tools that demand little \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trummer:2018:SRB, author = "Immanuel Trummer and Samuel Moseley and Deepak Maram and Saehan Jo and Joseph Antonakakis", title = "{SkinnerDB}: regret-bounded query evaluation via reinforcement learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2074--2077", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236263", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Robust query optimization becomes illusory in the presence of correlated predicates or user-defined functions. Occasionally, the query optimizer will choose join orders whose execution time is by many orders of magnitude higher than necessary. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vo:2018:ISD, author = "Hoang Vo and Yanhui Liang and Jun Kong and Fusheng Wang", title = "{iSPEED}: a scalable and distributed in-memory based spatial query system for large and structurally complex {3D} data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2078--2081", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236264", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recent technological advancement in digital pathology has enabled 3D tissue-based investigation of human diseases at extremely high resolutions. Discovering and verifying spatial patterns among massive 3D micro-anatomic biological objects such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Silva:2018:DRD, author = "V{\'\i}tor Silva and Daniel de Oliveira and Patrick Valduriez and Marta Mattoso", title = "{DfAnalyzer}: runtime dataflow analysis of scientific applications using provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2082--2085", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present DfAnalyzer, a tool that enables monitoring, debugging, steering, and analysis of dataflows while being generated by scientific applications. It works by capturing strategic domain data, registering provenance and execution data to enable \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hynes:2018:DSP, author = "Nick Hynes and David Dao and David Yan and Raymond Cheng and Dawn Song", title = "A demonstration of {Sterling}: a privacy-preserving data marketplace", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2086--2089", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we demonstrate Sterling, a decentralized marketplace for private data. Sterling enables privacy-preserving distribution and use of data by using privacy-preserving smart contracts which run on a permissionless blockchain. The privacy-preserving \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2018:CCT, author = "Yang Cao and Li Xiong and Masatoshi Yoshikawa and Yonghui Xiao and Si Zhang", title = "{ConTPL}: controlling temporal privacy leakage in differentially private continuous data release", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2090--2093", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3236267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many real-world systems, such as Internet of Thing, sensitive data streams are collected and analyzed continually. To protect privacy, a number of mechanisms are designed to achieve $ \epsilon $-differential privacy for processing sensitive streaming data, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dong:2018:DIM, author = "Xin Luna Dong and Theodoros Rekatsinas", title = "Data integration and machine learning: a natural synergy", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2094--2097", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3229876", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data volume and variety have increased, so have the ties between machine learning and data integration become stronger. For machine learning to be effective, one must utilize data from the greatest possible variety of sources; and this is why data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maiyya:2018:DDC, author = "Sujaya Maiyya and Victor Zakhary and Divyakant Agrawal and Amr {El Abbadi}", title = "Database and distributed computing fundamentals for scalable, fault-tolerant, and consistent maintenance of blockchains", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2098--2101", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3229877", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Bitcoin is a successful and interesting example of a global scale peer-to-peer cryptocurrency that integrates many techniques and protocols from cryptography, distributed systems, and databases. The main underlying data structure is blockchain, a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Faloutsos:2018:FBT, author = "Christos Faloutsos and Jan Gasthaus and Tim Januschowski and Yuyang Wang", title = "Forecasting big time series: old and new", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2102--2105", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3229878", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series forecasting is a key ingredient in the automation and optimization of business processes: in retail, deciding which products to order and where to store them depends on the forecasts of future demand in different regions; in cloud computing, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deutsch:2018:GDM, author = "Alin Deutsch and Yannis Papakonstantinou", title = "Graph data models, query languages and programming paradigms", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2106--2109", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3229879", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Numerous databases support semi-structured, schemaless and heterogeneous data, typically in the form of graphs (often restricted to trees and nested data). They also provide corresponding high-level query languages or graph-tailored programming \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cazalens:2018:CFC, author = "Sylvie Cazalens and Julien Leblay and Philippe Lamarre and Ioana Manolescu and Xavier Tannier", title = "Computational fact checking: a content management perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2110--2113", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3229880", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data journalism designates journalistic work inspired by digital data sources. A particularly popular and active area of data journalism is concerned with fact-checking. The term was born in the journalist community and referred the process of verifying \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Furtado:2018:IDM, author = "Antonio L. Furtado and Nivio Ziviani", title = "Information and data management at {PUC-Rio} and {UFMG}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2114--2129", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3240490", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This article presents a summary of the main activities of the Database \& Information Systems Research Group at Pontif{\'\i}cia Universidade Cat{\'o}lica do Rio de Janeiro (PUC-Rio) and the Information Management Research Group at Universidade Federal de Minas \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miller:2018:ODI, author = "Ren{\'e}e J. Miller", title = "Open data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2130--2139", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3240491", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Open data plays a major role in supporting both governmental and organizational transparency. Many organizations are adopting Open Data Principles promising to make their open data complete, primary, and timely. These properties make this data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cafarella:2018:TYW, author = "Michael Cafarella and Alon Halevy and Hongrae Lee and Jayant Madhavan and Cong Yu and Daisy Zhe Wang and Eugene Wu", title = "Ten years of {WebTables}", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2140--2149", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3240492", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In 2008, we wrote about WebTables, an effort to exploit the large and diverse set of structured databases casually published online in the form of HTML tables. The past decade has seen a flurry of research and commercial activities around the WebTables \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kraska:2018:NID, author = "Tim Kraska", title = "{Northstar}: an interactive data science system", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2150--2164", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3240493", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In order to democratize data science, we need to fundamentally rethink the current analytics stack, from the user interface to the ``guts.'' Most importantly, enabling a broader range of users to unfold the potential of (their) data requires a change in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Stoyanovich:2018:PDD, author = "Julia Stoyanovich and Bill Howe and H. V. Jagadish and Gerome Miklau", title = "{Panel}: a debate on data and algorithmic ethics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "12", pages = "2165--2167", month = aug, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3229863.3240494", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:52:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, there has begun a movement towards Fairness, Accountability, and Transparency (FAT) in algorithmic decision making, and in data science more broadly. The database community has not been significantly involved in this movement, despite ``owning'' \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Thomas:2018:CES, author = "Anthony Thomas and Arun Kumar", title = "A comparative evaluation of systems for scalable linear algebra-based analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2168--2182", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275367", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The growing use of statistical and machine learning (ML) algorithms to analyze large datasets has given rise to new systems to scale such algorithms. But implementing new scalable algorithms in low-level languages is a painful process, especially for enterprise and scientific users. To mitigate this issue, a new breed of systems expose high-level bulk linear algebra (LA) primitives that are scalable. 
By composing such LA primitives, users can write analysis algorithms in a higher-level language, while the system handles scalability issues. But there is little work on a unified comparative evaluation of the scalability, efficiency, and effectiveness of such ``scalable LA systems.'' We take a major step towards filling this gap. We introduce a suite of LA-specific tests based on our analysis of the data access and communication patterns of LA workloads and their use cases. Using our tests, we perform a comprehensive empirical comparison of a few popular scalable LA systems: MADlib, MLlib, SystemML, ScaLAPACK, SciDB, and TensorFlow using both synthetic data and a large real-world dataset. Our study has revealed several scalability bottlenecks, unusual performance trends, and even bugs in some systems. Our findings have already led to improvements in SystemML, with other systems' developers also expressing interest. All of our code and data scripts are available for download at https://adalabucsd.github.io/slab.html.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karthik:2018:CPL, author = "Srinivas Karthik and Jayant R. Haritsa and Sreyash Kenkre and Vinayaka Pandit", title = "A concave path to low-overhead robust query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2183--2195", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275368", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To address the classical selectivity estimation problem in database systems, a radically different query processing technique called PlanBouquet was proposed in 2014. 
In this approach, the estimation process is completely abandoned and replaced with a calibrated selectivity discovery mechanism. The beneficial outcome is that provable guarantees are obtained on worst-case execution performance, thereby facilitating robust query processing. An improved version of PlanBouquet, called SpillBound (SB), which significantly accelerates the selectivity discovery process, and provides platform-independent performance guarantees, was presented two years ago. Notwithstanding its benefits, a limitation of SpillBound is that its guarantees are predicated on expending enormous preprocessing efforts during query compilation, making it suitable only for canned queries that are invoked repeatedly. In this paper, we address this limitation by leveraging the fact that plan cost functions typically exhibit concave down behavior with regard to predicate selectivities. Specifically, we design FrugalSpillBound, which provably achieves extremely attractive tradeoffs between the performance guarantees and the compilation overheads. For instance, relaxing the performance guarantee by a factor of two typically results in at least two orders of magnitude reduction in the overheads. Further, when empirically evaluated on benchmark OLAP queries, the decrease in overheads is even greater, often more than three orders of magnitude. Therefore, FrugalSpillBound substantively extends robust query processing towards supporting ad-hoc queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wen:2018:ISE, author = "Yuhao Wen and Xiaodan Zhu and Sudeepa Roy and Jun Yang", title = "Interactive summarization and exploration of top aggregate query answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2196--2208", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275369", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a system for summarization and interactive exploration of high-valued aggregate query answers to make a large set of possible answers more informative to the user. Our system outputs a set of clusters on the high-valued query answers showing their common properties such that the clusters are diverse as much as possible to avoid repeating information, and cover a certain number of top original answers as indicated by the user. Further, the system facilitates interactive exploration of the query answers by helping the user (i) choose combinations of parameters for clustering, (ii) inspect the clusters as well as the elements they contain, and (iii) visualize how changes in parameters affect clustering. We define optimization problems, study their complexity, explore properties of the solutions investigating the semi-lattice structure on the clusters, and propose efficient algorithms and optimizations to achieve these goals. We evaluate our techniques experimentally and discuss our prototype with a graphical user interface that facilitates this interactive exploration. A user study is conducted to evaluate the usability of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kersten:2018:EYA, author = "Timo Kersten and Viktor Leis and Alfons Kemper and Thomas Neumann and Andrew Pavlo and Peter Boncz", title = "Everything you always wanted to know about compiled and vectorized queries but were afraid to ask", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2209--2222", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275370", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The query engines of most modern database systems are either based on vectorization or data-centric code generation. These two state-of-the-art query processing paradigms are fundamentally different in terms of system structure and query execution code. Both paradigms were used to build fast systems. However, until today it is not clear which paradigm yields faster query execution, as many implementation-specific choices obstruct a direct comparison of architectures. In this paper, we experimentally compare the two models by implementing both within the same test system. This allows us to use for both models the same query processing algorithms, the same data structures, and the same parallelization framework to ultimately create an apples-to-apples comparison. We find that both are efficient, but have different strengths and weaknesses. Vectorization is better at hiding cache miss latency, whereas data-centric compilation requires fewer CPU instructions, which benefits cache-resident workloads. Besides raw, single-threaded performance, we also investigate SIMD as well as multi-core parallelization and different hardware architectures. Finally, we analyze qualitative differences as a guide for system architects.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2018:DTK, author = "Junyang Gao and Pankaj K. Agarwal and Jun Yang", title = "Durable top-$k$ queries on temporal data", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2223--2235", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275371", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many datasets have a temporal dimension and contain a wealth of historical information. When using such data to make decisions, we often want to examine not only the current snapshot of the data but also its history. For example, given a result object of a snapshot query, we can ask for its ``durability,'' or intuitively, how long (or how often) it was valid in the past. This paper considers durable top-k queries, which look for objects whose values were among the top k for at least some fraction of the times during a given interval---e.g., stocks that were among the top 20 most heavily traded for at least 80\% of the trading days during the last quarter of 2017. We present a comprehensive suite of techniques for solving this problem, ranging from exact algorithms where k is fixed in advance, to approximate methods that work for any k and are able to exploit workload and data characteristics to improve accuracy while capping index cost. We show that our methods vastly outperform baseline and previous methods using both real and synthetic datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Linardi:2018:SVL, author = "Michele Linardi and Themis Palpanas", title = "Scalable, variable-length similarity search in data series: the {ULISSE} approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2236--2248", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275372", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data series similarity search is an important operation and at the core of several analysis tasks and applications related to data series collections. Despite the fact that data series indexes enable fast similarity search, all existing indexes can only answer queries of a single length (fixed at index construction time), which is a severe limitation. In this work, we propose ULISSE, the first data series index structure designed for answering similarity search queries of variable length. Our contribution is two-fold. First, we introduce a novel representation technique, which effectively and succinctly summarizes multiple sequences of different length (irrespective of Z-normalization). Based on the proposed index, we describe efficient algorithms for approximate and exact similarity search, combining disk based index visits and in-memory sequential scans. We experimentally evaluate our approach using several synthetic and real datasets. The results show that ULISSE is several times (and up to orders of magnitude) more efficient in terms of both space and time cost, when compared to competing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sauer:2018:FLS, author = "Caetano Sauer and Goetz Graefe and Theo H{\"a}rder", title = "{FineLine}: log-structured transactional storage and recovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2249--2262", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275373", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recovery is an intricate aspect of transaction processing architectures. In its traditional implementation, recovery requires the management of two persistent data stores---a write-ahead log and a materialized database---which must be carefully orchestrated to maintain transactional consistency. Furthermore, the design and implementation of recovery algorithms have deep ramifications into almost every component of the internal system architecture, from concurrency control to buffer management and access path implementation. Such complexity not only incurs high costs for development, testing, and training, but also unavoidably affects system performance, introducing overheads and limiting scalability. This paper proposes a novel approach for transactional storage and recovery called FineLine. It simplifies the implementation of transactional database systems by eliminating the log-database duality and maintaining all persistent data in a single, log-structured data structure. This approach not only provides more efficient recovery with less overhead, but also decouples the management of persistent data from in-memory access paths. As such, it blurs the lines that separate in-memory from disk-based database systems, providing the efficiency of the former with the reliability of the latter.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rahman:2018:IMH, author = "Protiva Rahman and Courtney Hebert and Arnab Nandi", title = "{ICARUS}: minimizing human effort in iterative data completion", journal = j-PROC-VLDB-ENDOWMENT, volume = "11", number = "13", pages = "2263--2276", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275366.3275374", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 11 16:22:00 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An important step in data preparation involves dealing with incomplete datasets. In some cases, the missing values are unreported because they are characteristics of the domain and are known by practitioners. Due to this nature of the missing values, imputation and inference methods do not work and input from domain experts is required. A common method for experts to fill missing values is through rules. However, for large datasets with thousands of missing data points, it is laborious and time consuming for a user to make sense of the data and formulate effective completion rules. Thus, users need to be shown subsets of the data that will have the most impact in completing missing fields. Further, these subsets should provide the user with enough information to make an update. Choosing subsets that maximize the probability of filling in missing data from a large dataset is computationally expensive. To address these challenges, we present Icarus, which uses a heuristic algorithm to show the user small subsets of the database in the form of a matrix. This allows the user to iteratively fill in data by applying suggested rules based on their direct edits to the matrix. The suggested rules amplify the users' input to multiple missing fields by using the database schema to infer hierarchies. 
Simulations show Icarus has an average improvement of 50\% across three datasets over the baseline system. Further, in-person user studies demonstrate that naive users can fill in 68\% of missing data within an hour, while manual rule specification spans weeks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kim:2018:LIW, author = "Sunghwan Kim and Taesung Lee and Seung-won Hwang and Sameh Elnikety", title = "List intersection for web search: algorithms, cost models, and optimizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "1--13", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275537", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies the optimization of list intersection, especially in the context of the matching phase of search engines. Given a user query, we intersect the postings lists corresponding to the query keywords to generate the list of documents matching all keywords. Since the speed of list intersection depends on the algorithm, hardware, and list lengths and their correlations, none of the existing intersection algorithms outperforms the others in every scenario. Therefore, we develop a cost-based approach in which we identify a search space, spanning existing algorithms and their combinations. We propose a cost model to estimate the cost of the algorithms with their combinations, and use the cost model to search for the lowest-cost algorithm. The resulting plan is usually a combination of 2-way algorithms, outperforming conventional 2-way and $k$-way algorithms. The proposed approach is more general than designing a specific algorithm, as the cost models can be adapted to different hardware.
We validate the cost model experimentally on two different CPUs, and show that the cost model closely estimates the actual cost. Using both real and synthetic datasets, we show that the proposed cost-based optimizer outperforms the state-of-the-art alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Whittaker:2018:ICC, author = "Michael Whittaker and Joseph M. Hellerstein", title = "Interactive checks for coordination avoidance", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "14--27", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275538", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Strongly consistent distributed systems are easy to reason about but face fundamental limitations in availability and performance. Weakly consistent systems can be implemented with very high performance but place a burden on the application developer to reason about complex interleavings of execution. Invariant confluence provides a formal framework for understanding when we can get the best of both worlds. An invariant confluent object can be efficiently replicated with no coordination needed to preserve its invariants. However, actually determining whether or not an object is invariant confluent is challenging. In this paper, we establish conditions under which a commonly used sufficient condition for invariant confluence is both necessary and sufficient, and we use this condition to design (a) a general-purpose interactive invariant confluence decision procedure and (b) a novel sufficient condition that can be checked automatically. 
We then take a step beyond invariant confluence and introduce a generalization of invariant confluence, called segmented invariant confluence, that allows us to replicate non-invariant confluent objects with a small amount of coordination. We implemented these formalisms in a prototype called Lucy and found that our decision procedures efficiently handle common real-world workloads including foreign keys, rollups, escrow transactions, and more. We also found that segmented invariant confluent replication can deliver up to an order of magnitude more throughput than linearizable replication for low contention workloads and comparable throughput for medium to high contention workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qin:2018:PPF, author = "Jianbin Qin and Chuan Xiao", title = "{Pigeonring}: a principle for faster thresholded similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "28--42", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275539", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The pigeonhole principle states that if n items are contained in m boxes, then at least one box has no more than n/m items. It is utilized to solve many data management problems, especially for thresholded similarity searches. Despite many pigeonhole principle-based solutions proposed in the last few decades, the condition stated by the principle is weak. It only constrains the number of items in a single box. By organizing the boxes in a ring, we propose a new principle, called the pigeonring principle, which constrains the number of items in multiple boxes and yields stronger conditions. 
To utilize the new principle, we focus on problems defined in the form of identifying data objects whose similarities or distances to the query is constrained by a threshold. Many solutions to these problems utilize the pigeonhole principle to find candidates that satisfy a filtering condition. By the new principle, stronger filtering conditions can be established. We show that the pigeonhole principle is a special case of the new principle. This suggests that all the pigeonhole principle-based solutions are possible to be accelerated by the new principle. A universal filtering framework is introduced to encompass the solutions to these problems based on the new principle. Besides, we discuss how to quickly find candidates specified by the new principle. The implementation requires only minor modifications on top of existing pigeonhole principle-based algorithms. Experimental results on real datasets demonstrate the applicability of the new principle as well as the superior performance of the algorithms based on the new principle.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sariyuce:2018:LAH, author = "Ahmet Erdem Sariy{\"u}ce and C. Seshadhri and Ali Pinar", title = "Local algorithms for hierarchical dense subgraph discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "43--56", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275540", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding the dense regions of a graph and relations among them is a fundamental problem in network analysis. Core and truss decompositions reveal dense subgraphs with hierarchical relations. 
The incremental nature of algorithms for computing these decompositions and the need for global information at each step of the algorithm hinders scalable parallelization and approximations since the densest regions are not revealed until the end. In a previous work, Lu et al. proposed to iteratively compute the $h$-indices of neighbor vertex degrees to obtain the core numbers and prove that the convergence is obtained after a finite number of iterations. This work generalizes the iterative $h$-index computation for truss decomposition as well as nucleus decomposition which leverages higher-order structures to generalize core and truss decompositions. In addition, we prove convergence bounds on the number of iterations. We present a framework of local algorithms to obtain the core, truss, and nucleus decompositions. Our algorithms are local, parallel, offer high scalability, and enable approximations to explore time and quality trade-offs. Our shared-memory implementation verifies the efficiency, scalability, and effectiveness of our local algorithms on real-world networks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2018:CED, author = "Jingru Yang and Ju Fan and Zhewei Wei and Guoliang Li and Tongyu Liu and Xiaoyong Du", title = "Cost-effective data annotation using game-based crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "57--70", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275541", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large-scale data annotation is indispensable for many applications, such as machine learning and data integration.
However, existing annotation solutions either incur expensive cost for large datasets or produce noisy results. This paper introduces a cost-effective annotation approach, and focuses on the labeling rule generation problem that aims to generate high-quality rules to largely reduce the labeling cost while preserving quality. To address the problem, we first generate candidate rules, and then devise a game-based crowdsourcing approach CROWDGAME to select high-quality rules by considering coverage and precision. CROWDGAME employs two groups of crowd workers: one group answers rule validation tasks (whether a rule is valid) to play a role of rule generator, while the other group answers tuple checking tasks (whether the annotated label of a data tuple is correct) to play a role of rule refuter. We let the two groups play a two-player game: rule generator identifies high-quality rules with large coverage and precision, while rule refuter tries to refute its opponent rule generator by checking some tuples that provide enough evidence to reject rules covering the tuples. This paper studies the challenges in CROWDGAME. The first is to balance the trade-off between coverage and precision. We define the loss of a rule by considering the two factors. The second is rule precision estimation. We utilize Bayesian estimation to combine both rule validation and tuple checking tasks. The third is to select crowdsourcing tasks to fulfill the game-based framework for minimizing the loss. We introduce a minimax strategy and develop efficient task selection algorithms. We conduct experiments on entity matching and relation extraction, and the results show that our method outperforms state-of-the-art solutions.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2018:OAL, author = "Enhui Huang and Liping Peng and Luciano {Di Palma} and Ahmed Abdelkafi and Anna Liu and Yanlei Diao", title = "Optimization for active learning-based interactive database exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "1", pages = "71--84", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3275536.3275542", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:47 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is an increasing gap between fast growth of data and limited human ability to comprehend data. Consequently, there has been a growing demand of data management tools that can bridge this gap and help the user retrieve high-value content from data more effectively. In this work, we aim to build interactive data exploration as a new database service, using an approach called ``explore-by-example''. In particular, we cast the explore-by-example problem in a principled ``active learning'' framework, and bring the properties of important classes of database queries to bear on the design of new algorithms and optimizations for active learning-based database exploration. These new techniques allow the database system to overcome a fundamental limitation of traditional active learning, i.e., the slow convergence problem. Evaluation results using real-world datasets and user interest patterns show that our new system significantly outperforms state-of-the-art active learning techniques and data exploration systems in accuracy while achieving desired efficiency for interactive performance.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bleifuss:2018:ECN, author = "Tobias Bleifu{\ss} and Leon Bornemann and Theodore Johnson and Dmitri V. Kalashnikov and Felix Naumann and Divesh Srivastava", title = "Exploring change: a new dimension of data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "85--98", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282496", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data and metadata in datasets experience many different kinds of change. Values are inserted, deleted or updated; rows appear and disappear; columns are added or repurposed, etc. In such a dynamic situation, users might have many questions related to changes in the dataset, for instance which parts of the data are trustworthy and which are not? Users will wonder: How many changes have there been in the recent minutes, days or years? What kind of changes were made at which points of time? How dirty is the data? Is data cleansing required? The fact that data changed can hint at different hidden processes or agendas: a frequently crowd-updated city name may be controversial; a person whose name has been recently changed may be the target of vandalism; and so on. We show various use cases that benefit from recognizing and exploring such change. We envision a system and methods to interactively explore such change, addressing the variability dimension of big data challenges. To this end, we propose a model to capture change and the process of exploring dynamic data to identify salient changes. We provide exploration primitives along with motivational examples and measures for the volatility of data. 
We identify technical challenges that need to be addressed to make our vision a reality, and propose directions of future work for the data management community.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ghosh:2018:FSS, author = "Bishwamittra Ghosh and Mohammed Eunus Ali and Farhana M. Choudhury and Sajid Hasan Apon and Timos Sellis and Jianxin Li", title = "The flexible socio spatial group queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "99--111", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282497", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A socio spatial group query finds a group of users who possess strong social connections with each other and have the minimum aggregate spatial distance to a meeting point. Existing studies limit to either finding the best group of a fixed size for a single meeting location, or a single group of a fixed size w.r.t. multiple locations. However, it is highly desirable to consider multiple locations in a real-life scenario in order to organize impromptu activities of groups of various sizes. In this paper, we propose Top k Flexible Socio Spatial Group Query (Top k-FSSGQ) to find the top k groups w.r.t. multiple POIs where each group follows the minimum social connectivity constraints. We devise a ranking function to measure the group score by combining social closeness, spatial distance, and group size, which provides the flexibility of choosing groups of different sizes under different constraints. To effectively process the Top k-FSSGQ, we first develop an Exact approach that ensures early termination of the search based on the derived upper bounds. 
We prove that the problem is NP-hard, hence we first present a heuristic based approximation algorithm to effectively select members in intermediate solution groups based on the social connectivity of the users. Later we design a Fast Approximate approach based on the relaxed social and spatial bounds, and connectivity constraint heuristic. Experimental studies have verified the effectiveness and efficiency of our proposed approaches on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Echihabi:2018:LHD, author = "Karima Echihabi and Kostas Zoumpatianos and Themis Palpanas and Houda Benbrahim", title = "The {Lernaean Hydra} of data series similarity search: an experimental evaluation of the state of the art", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "112--127", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282498", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Increasingly large data series collections are becoming commonplace across many different domains and applications. A key operation in the analysis of data series collections is similarity search, which has attracted lots of attention and effort over the past two decades. Even though several relevant approaches have been proposed in the literature, none of the existing studies provides a detailed evaluation against the available alternatives. The lack of comparative results is further exacerbated by the non-standard use of terminology, which has led to confusion and misconceptions. 
In this paper, we provide definitions for the different flavors of similarity search that have been studied in the past, and present the first systematic experimental evaluation of the efficiency of data series similarity search techniques. Based on the experimental results, we describe the strengths and weaknesses of each approach and give recommendations for the best approach to use under typical use cases. Finally, by identifying the shortcomings of each method, our findings lay the ground for solid further developments in the field.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2018:RML, author = "Wei Wang and Jinyang Gao and Meihui Zhang and Sheng Wang and Gang Chen and Teck Khim Ng and Beng Chin Ooi and Jie Shao and Moaz Reyad", title = "{Rafiki}: machine learning as an analytics service system", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "128--140", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282499", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data analytics is gaining massive momentum in the last few years. Applying machine learning models to big data has become an implicit requirement or an expectation for most analysis tasks, especially on high-stakes applications. Typical applications include sentiment analysis against reviews for analyzing on-line products, image classification in food logging applications for monitoring user's daily intake, and stock movement prediction. Extending traditional database systems to support the above analysis is intriguing but challenging. First, it is almost impossible to implement all machine learning models in the database engines. 
Second, expert knowledge is required to optimize the training and inference procedures in terms of efficiency and effectiveness, which imposes heavy burden on the system users. In this paper, we develop and present a system, called Rafiki, to provide the training and inference service of machine learning models. Rafiki provides distributed hyper-parameter tuning for the training service, and online ensemble modeling for the inference service which trades off between latency and accuracy. Experimental results confirm the efficiency, effectiveness, scalability and usability of Rafiki.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Subotic:2018:AIS, author = "Pavle Suboti{\'c} and Herbert Jordan and Lijun Chang and Alan Fekete and Bernhard Scholz", title = "Automatic index selection for large-scale datalog computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "141--153", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282500", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Datalog has been applied to several use cases that require very high performance on large rulesets and factsets. It is common to create indexes for relations to improve search performance. However, the existing indexing schemes either require manual index selection or result in insufficient performance on very large tasks. In this paper, we propose an automatic scheme to select indexes. We automatically create the minimum number of indexes to speed up all the searches in a given Datalog program. We have integrated our indexing scheme into an open-source Datalog engine SOUFFL{\'E}.
We obtain performance on a par with what users have accepted from hand-optimized Datalog programs running on state-of-the-art Datalog engines, while we do not require the effort of manual index selection. Extensive experiments on large real Datalog programs demonstrate that our indexing scheme results in considerable speedups (up to 2x) and significantly less memory usage (up to 6x) compared with other automated index selections.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Song:2018:SLF, author = "Shuang Song and Xu Liu and Qinzhe Wu and Andreas Gerstlauer and Tao Li and Lizy K. John", title = "Start late, finish early: a distributed graph processing system with redundancy reduction", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "154--168", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282501", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph processing systems are important in the big data domain. However, processing graphs in parallel often introduces redundant computations in existing algorithms and models. Prior work has proposed techniques to optimize redundancies for out-of-core graph systems, rather than distributed graph systems. In this paper, we study various state-of-the-art distributed graph systems and observe root causes for these pervasively existing redundancies. To reduce redundancies without sacrificing parallelism, we further propose SLFE, a distributed graph processing system, designed with the principle of ``start late or finish early''. SLFE employs a novel preprocessing stage to obtain a graph's topological knowledge with negligible overhead. 
SLFE's redundancy-aware vertex-centric computation model can then utilize such knowledge to reduce the redundant computations at runtime. SLFE also provides a set of APIs to improve programmability. Our experiments on an 8-machine high-performance cluster show that SLFE outperforms all well-known distributed graph processing systems with the inputs of real-world graphs, yielding up to 75x speedup. Moreover, SLFE outperforms two state-of-the-art shared memory graph systems on a high-end machine with up to 1644x speedup. SLFE's redundancy-reduction schemes are generally applicable to other vertex-centric graph processing systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2018:IOC, author = "Bailu Ding and Lucja Kot and Johannes Gehrke", title = "Improving optimistic concurrency control through transaction batching and operation reordering", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "169--182", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282502", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "OLTP systems can often improve throughput by batching transactions and processing them as a group. Batching has been used for optimizations such as message packing and group commits; however, there is little research on the benefits of a holistic approach to batching across a transaction's entire life cycle. In this paper, we present a framework to incorporate batching at multiple stages of transaction execution for OLTP systems based on optimistic concurrency control. Storage batching enables reordering of transaction reads and writes at the storage layer, reducing conflicts on the same object. 
Validator batching enables reordering of transactions before validation, reducing conflicts between transactions. Dependencies between transactions make transaction reordering a non-trivial problem, and we propose several efficient and practical algorithms that can be customized to various transaction precedence policies such as reducing tail latency. We also show how to reorder transactions with a thread-aware policy in multi-threaded OLTP architecture without a centralized validator. In-depth experiments on a research prototype, an opensource OLTP system, and a production OLTP system show that our techniques increase transaction throughput by up to 2.2x and reduce their tail latency by up to 71\% compared with the state-of-the-art systems on workloads with high data contention.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xie:2018:QLC, author = "Ting Xie and Varun Chandola and Oliver Kennedy", title = "Query log compression for workload analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "183--196", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analyzing database access logs is a key part of performance tuning, intrusion detection, benchmark development, and many other database administration tasks. Unfortunately, it is common for production databases to deal with millions or more queries each day, so these logs must be summarized before they can be used. Designing an appropriate summary encoding requires trading off between conciseness and information content.
For example: simple workload sampling may miss rare, but high impact queries. In this paper, we present LOGR, a lossy log compression scheme suitable for use in many automated log analytics tools, as well as for human inspection. We formalize and analyze the space/fidelity trade-off in the context of a broader family of ``pattern'' and ``pattern mixture'' log encodings to which LOGR belongs. We show through a series of experiments that LOGR compressed encodings can be created efficiently, come with provable information-theoretic bounds on their accuracy, and outperform state-of-art log summarization strategies.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ali:2018:MTC, author = "Mohammed Eunus Ali and Shadman Saqib Eusuf and Kaysar Abdullah and Farhana M. Choudhury and J. Shane Culpepper and Timos Sellis", title = "The maximum trajectory coverage query in spatial databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "197--209", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the widespread use of GPS-enabled mobile devices, an unprecedented amount of trajectory data has become available from various sources such as Bikely, GPS-wayPoints, and Uber. The rise of smart transportation services and recent break-throughs in autonomous vehicles increase our reliance on trajectory data in a wide variety of applications. Supporting these services in emerging platforms requires more efficient query processing in trajectory databases. 
In this paper, we propose two new coverage queries for trajectory databases: (i) $k$ Best Facility Trajectory Search ($k$BFT); and (ii) $k$ Best Coverage Facility Trajectory Search ($k$BCovFT). We propose a novel index structure, the Trajectory Quadtree (TQ-tree) that utilizes a quadtree to hierarchically organize trajectories into different nodes, and then applies a z-ordering to further organize the trajectories by spatial locality inside each node. This structure is highly effective in pruning the trajectory search space, which is of independent interest. By exploiting the TQ-tree, we develop a divide-and-conquer approach to efficiently process a $k$BFT query. To solve the $k$BCovFT, which is a non-submodular NP-hard problem, we propose a greedy approximation. We evaluate our algorithms through an extensive experimental study on several real datasets, and demonstrate that our algorithms outperform baselines by two to three orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2018:TLO, author = "Chenggang Wu and Alekh Jindal and Saeed Amizadeh and Hiren Patel and Wangchao Le and Shi Qiao and Sriram Rao", title = "Towards a learning optimizer for shared clouds", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "210--222", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers are notorious for inaccurate cost estimates, leading to poor performance. The root of the problem lies in inaccurate cardinality estimates, i.e., the size of intermediate (and final) results in a query plan. These estimates also determine the resources consumed in modern shared cloud infrastructures. 
In this paper, we present CARDLEARNER, a machine learning based approach to learn cardinality models from previous job executions and use them to predict the cardinalities in future jobs. The key intuition in our approach is that shared cloud workloads are often recurring and overlapping in nature, and so we could learn cardinality models for overlapping subgraph templates. We discuss various learning approaches and show how learning a large number of smaller models results in high accuracy and explainability. We further present an exploration technique to avoid learning bias by considering alternate join orders and learning cardinality models over them. We describe the feedback loop to apply the learned models back to future job executions. Finally, we show a detailed evaluation of our models (up to 5 orders of magnitude less error), query plans (60\% applicability), performance (up to 100\% faster, 3x fewer resources), and exploration (optimal in few 10s of executions).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Varma:2018:SAW, author = "Paroma Varma and Christopher R{\'e}", title = "{Snuba}: automating weak supervision to label training data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "223--236", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291268", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As deep learning models are applied to increasingly diverse problems, a key bottleneck is gathering enough high-quality training labels tailored to each task. Users therefore turn to weak supervision, relying on imperfect sources of labels like pattern matching and user-defined heuristics. Unfortunately, users have to design these sources for each task. 
This process can be time consuming and expensive: domain experts often perform repetitive steps like guessing optimal numerical thresholds and developing informative text patterns. To address these challenges, we present Snuba, a system to automatically generate heuristics using a small labeled dataset to assign training labels to a large, unlabeled dataset in the weak supervision setting. Snuba generates heuristics that each labels the subset of the data it is accurate for, and iteratively repeats this process until the heuristics together label a large portion of the unlabeled data. We develop a statistical measure that guarantees the iterative process will automatically terminate before it degrades training label quality. Snuba automatically generates heuristics in under five minutes and performs up to 9.74 F1 points better than the best known user-defined heuristics developed over many days. In collaborations with users at research labs, Stanford Hospital, and on open source datasets, Snuba outperforms other automated approaches like semi-supervised learning by up to 14.35 F1 points.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Asudeh:2018:OSR, author = "Abolfazl Asudeh and H. V. Jagadish and Gerome Miklau and Julia Stoyanovich", title = "On obtaining stable rankings", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "237--250", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291269", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Decision making is challenging when there is more than one criterion to consider. In such cases, it is common to assign a goodness score to each item as a weighted sum of its attribute values and rank them accordingly. 
Clearly, the ranking obtained depends on the weights used for this summation. Ideally, one would want the ranked order not to change if the weights are changed slightly. We call this property stability of the ranking. A consumer of a ranked list may trust the ranking more if it has high stability. A producer of a ranked list prefers to choose weights that result in a stable ranking, both to earn the trust of potential consumers and because a stable ranking is intrinsically likely to be more meaningful. In this paper, we develop a framework that can be used to assess the stability of a provided ranking and to obtain a stable ranking within an ``acceptable'' range of weight values (called ``the region of interest''). We address the case where the user cares about the rank order of the entire set of items, and also the case where the user cares only about the top-$k$ items. Using a geometric interpretation, we propose algorithms that produce stable rankings. In addition to theoretical analyses, we conduct extensive experiments on real datasets that validate our proposal.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ji:2018:PTB, author = "Shuping Ji and Hans-Arno Jacobsen", title = "{PS}-tree-based efficient {Boolean} expression matching for high-dimensional and dense workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "251--264", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291270", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Boolean expression matching is an important function for many applications. However, existing solutions still suffer from limitations when applied to high-dimensional and dense workloads. 
To overcome these limitations, in this paper, we design a data structure called PS-Tree that can efficiently index subscriptions in one dimension. By dividing predicates into disjoint predicate spaces, PS-Tree achieves high matching performance and good expressiveness. Based on PS-Tree, we first propose a Boolean expression matching algorithm PSTBloom. By efficiently filtering out a large proportion of unmatching subscriptions, PSTBloom achieves high matching performance, especially for high-dimensional workloads. PSTBloom also achieves fast index construction and a small memory footprint. Compared with state-of-the-art methods, comprehensive experiments show that PSTBloom reduces matching time, index construction time and memory usage by up to 84\%, 78\% and 94\%, respectively. Although PSTBloom is effective for many workload distributions, dense workloads represent new challenges to PSTBloom and other algorithms. To effectively handle dense workloads, we further propose the PSTHash algorithm, which divides subscriptions into disjoint multidimensional predicate spaces. This organization prunes partially matching subscriptions efficiently. Comprehensive experiments on both synthetic and real-world datasets show that PSTHash improves the matching performance by up to 92\% for dense workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2018:SMR, author = "Yizhou Yan and Lei Cao and Samuel Madden and Elke A. 
Rundensteiner", title = "{SWIFT}: mining representative patterns from large event streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "265--277", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Event streams generated by smart devices common in modern Internet of Things applications must be continuously mined to monitor the behavior of the underlying system. In this work, we propose a stream pattern mining system for supporting online IoT applications. First, to solve the pattern explosion problem of existing stream pattern mining strategies, we now design pattern semantics that continuously produce a compact set of patterns that maximally compresses the dynamic data streams, called MDL-based Representative Patterns (MRP). We then design a one-pass SWIFT approach that continuously mines the up-to-date MRP pattern set for each stream window upon the arrival or expiration of individual events. We show that SWIFT is guaranteed to select the update operation for each individual incoming event that leads to the most compact encoding of the sequence in the current window. We further enhance SWIFT to support batch updates, called B-SWIFT. B-SWIFT adopts a lazy update strategy that guarantees that only the minimal number of operations are conducted to process an incoming event batch for MRP pattern mining. Evaluation by our industry lighting lab collaborator demonstrates that SWIFT successfully solves their use cases and finds more representative patterns than the alternative approaches adapting the state-of-the-art static representative pattern mining methods. 
Our experimental study confirms that SWIFT outperforms the best existing method up to 50\% in the compactness of produced pattern encodings, while providing a 4 orders of magnitude speedup.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{C:2018:SSS, author = "Paul Suganthan G. C. and Adel Ardalan and AnHai Doan and Aditya Akella", title = "{Smurf}: self-service string matching using random forests", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "278--291", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291272", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We argue that more attention should be devoted to developing self-service string matching (SM) solutions, which lay users can easily use. We show that Falcon, a self-service entity matching (EM) solution, can be applied to SM and is more accurate than current self-service SM solutions. However, Falcon often asks lay users to label many string pairs (e.g., 770-1050 in our experiments). This is expensive, can significantly compound labeling mistakes, and takes a long time. We developed Smurf, a self-service SM solution that reduces the labeling effort by 43-76\%, yet achieves comparable F$_1$ accuracy. The key to make Smurf possible is a novel solution to efficiently execute a random forest (that Smurf learns via active learning with the lay user) over two sets of strings. This solution uses RDBMS-style plan optimization to reuse computations across the trees in the forest. 
As such, Smurf significantly advances self-service SM and raises interesting future directions for self-service EM and scalable random forest execution over structured data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Liu:2018:CSD, author = "Feilong Liu and Ario Salmasi and Spyros Blanas and Anastasios Sidiropoulos", title = "Chasing similarity: distribution-aware aggregation scheduling", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "292--306", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291273", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Parallel aggregation is a ubiquitous operation in data analytics that is expressed as GROUP BY in SQL, reduce in Hadoop, or segment in TensorFlow. Parallel aggregation starts with an optional local pre-aggregation step and then repartitions the intermediate result across the network. While local pre-aggregation works well for low-cardinality aggregations, the network communication cost remains significant for high-cardinality aggregations even after local pre-aggregation. The problem is that the repartition-based algorithm for high-cardinality aggregation does not fully utilize the network. In this work, we first formulate a mathematical model that captures the performance of parallel aggregation. We prove that finding optimal aggregation plans from a known data distribution is NP-hard, assuming the Small Set Expansion conjecture. We propose GRASP, a GReedy Aggregation Scheduling Protocol that decomposes parallel aggregation into phases. GRASP is distribution-aware as it aggregates the most similar partitions in each phase to reduce the transmitted data size in subsequent phases. 
In addition, GRASP takes the available network bandwidth into account when scheduling aggregations in each phase to maximize network utilization. The experimental evaluation on real data shows that GRASP outperforms repartition-based aggregation by 3.5x and LOOM by 2.0x.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bater:2018:SES, author = "Johes Bater and Xi He and William Ehrich and Ashwin Machanavajjhala and Jennie Rogers", title = "{Shrinkwrap}: efficient {SQL} query processing in differentially private data federations", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "3", pages = "307--320", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3291264.3291274", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jan 18 05:54:04 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A private data federation is a set of autonomous databases that share a unified query interface offering in-situ evaluation of SQL queries over the union of the sensitive data of its members. Owing to privacy concerns, these systems do not have a trusted data collector that can see all their data and their member databases cannot learn about individual records of other engines. Federations currently achieve this goal by evaluating queries obliviously using secure multiparty computation. This hides the intermediate result cardinality of each query operator by exhaustively padding it. With cascades of such operators, this padding accumulates to a blow-up in the output size of each operator and a proportional loss in query performance. Hence, existing private data federations do not scale well to complex SQL queries over large datasets. 
We introduce Shrinkwrap, a private data federation that offers data owners a differentially private view of the data held by others to improve their performance over oblivious query processing. Shrinkwrap uses computational differential privacy to minimize the padding of intermediate query results, achieving up to a 35X performance improvement over oblivious query processing. When the query needs differentially private output, Shrinkwrap provides a trade-off between result accuracy and query evaluation performance.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gill:2018:SPP, author = "Gurbinder Gill and Roshan Dathathri and Loc Hoang and Keshav Pingali", title = "A study of partitioning policies for graph analytics on large-scale distributed platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "321--334", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297754", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed-memory clusters are used for in-memory processing of very large graphs with billions of nodes and edges. This requires partitioning the graph among the machines in the cluster. When a graph is partitioned, a node in the graph may be replicated on several machines, and communication is required to keep these replicas synchronized. Good partitioning policies attempt to reduce this synchronization overhead while keeping the computational load balanced across machines. 
A number of recent studies have looked at ways to control replication of nodes, but these studies are not conclusive because they were performed on small clusters with eight to sixteen machines, did not consider work-efficient data-driven algorithms, or did not optimize communication for the partitioning strategies they studied. This paper presents an experimental study of partitioning strategies for work-efficient graph analytics applications on large KNL and Skylake clusters with up to 256 machines using the Gluon communication runtime which implements partitioning-specific communication optimizations. Evaluation results show that although simple partitioning strategies like Edge-Cuts perform well on a small number of machines, an alternative partitioning strategy called Cartesian Vertex-Cut (CVC) performs better at scale even though paradoxically it has a higher replication factor and performs more communication than Edge-Cut partitioning does. Results from communication micro-benchmarks resolve this paradox by showing that communication overhead depends not only on communication volume but also on the communication pattern among the partitions. These experiments suggest that high-performance graph analytics systems should support multiple partitioning strategies, like Gluon does, as no single graph partitioning strategy is best for all cluster sizes. For such systems, a decision tree for selecting a good partitioning strategy based on characteristics of the computation and the cluster is presented.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kumar:2018:UDG, author = "K. 
Ashwin Kumar and Petros Efstathopoulos", title = "Utility-driven graph summarization", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "335--347", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297755", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A lot of the large datasets analyzed today represent graphs. In many real-world applications, summarizing large graphs is beneficial (or necessary) so as to reduce a graph's size and, thus, achieve a number of benefits, including but not limited to (1) significant speed-up for graph algorithms, (2) graph storage space reduction, (3) faster network transmission, (4) improved data privacy, (5) more effective graph visualization, etc. During the summarization process, potentially useful information is removed from the graph (nodes and edges are removed or transformed). Consequently, one important problem with graph summarization is that, although it reduces the size of the input graph, it also adversely affects and reduces its utility. The key question that we pose in this paper is, can we summarize and compress a graph while ensuring that its utility or usefulness does not drop below a certain user-specified utility threshold? We explore this question and propose a novel iterative utility-driven graph summarization approach. During iterative summarization, we incrementally keep track of the utility of the graph summary. This enables a user to query a graph summary that is conditioned on a user-specified utility value. We present both exhaustive and scalable approaches for implementing our proposed solution. Our experimental results on real-world graph datasets show the effectiveness of our proposed approach. 
Finally, through multiple real-world applications we demonstrate the practicality of our notion of utility of the computed graph summary.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kara:2018:CCS, author = "Kaan Kara and Ken Eguro and Ce Zhang and Gustavo Alonso", title = "{ColumnML}: column-store machine learning with on-the-fly data transformation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "348--361", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ability to perform machine learning (ML) tasks in a database management system (DBMS) provides the data analyst with a powerful tool. Unfortunately, integration of ML into a DBMS is challenging for reasons varying from differences in execution model to data layout requirements. In this paper, we assume a column-store main-memory DBMS, optimized for online analytical processing, as our initial system. On this system, we explore the integration of coordinate-descent based methods working natively on columnar format to train generalized linear models. We use a cache-efficient, partitioned stochastic coordinate descent algorithm providing linear throughput scalability with the number of cores while preserving convergence quality, up to 14 cores in our experiments. Existing column oriented DBMS rely on compression and even encryption to store data in memory. When those features are considered, the performance of a CPU based solution suffers. Thus, in the paper we also show how to exploit hardware acceleration as part of a hybrid CPU+FPGA system to provide on-the-fly data transformation combined with an FPGA-based coordinate-descent engine. 
The resulting system is a column-store DBMS with its important features preserved (e.g., data compression) that offers high performance machine learning capabilities.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2018:CED, author = "Yanying Li and Haipei Sun and Boxiang Dong and Hui (Wendy) Wang", title = "Cost-efficient data acquisition on online data marketplaces for correlation analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "362--375", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Incentivized by the enormous economic profits, the data marketplace platform has been proliferated recently. In this paper, we consider the data marketplace setting where a data shopper would like to buy data instances from the data marketplace for correlation analysis of certain attributes. We assume that the data in the marketplace is dirty and not free. The goal is to find the data instances from a large number of datasets in the marketplace whose join result not only is of high-quality and rich join informativeness, but also delivers the best correlation between the requested attributes. To achieve this goal, we design DANCE, a middleware that provides the desired data acquisition service. DANCE consists of two phases: (1) In the off-line phase, it constructs a two-layer join graph from samples. 
The join graph includes the information of the datasets in the marketplace at both schema and instance levels; (2) In the online phase, it searches for the data instances that satisfy the constraints of data quality, budget, and join informativeness, while maximizing the correlation of source and target attribute sets. We prove that the complexity of the search problem is NP-hard, and design a heuristic algorithm based on Markov chain Monte Carlo (MCMC). Experiment results on two benchmark and one real datasets demonstrate the efficiency and effectiveness of our heuristic data acquisition algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dolatshah:2018:CCL, author = "Mohamad Dolatshah and Mathew Teoh and Jiannan Wang and Jian Pei", title = "Cleaning crowdsourced labels using oracles for statistical classification", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "376--389", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nowadays, crowdsourcing is being widely used to collect training data for solving classification problems. However, crowdsourced labels are often noisy, and there is a performance gap between classification with noisy labels and classification with ground-truth labels. In this paper, we consider how to apply oracle-based label cleaning to reduce the gap. We propose TARS, a label-cleaning advisor that can provide two pieces of valuable advice for data scientists when they need to train or test a model using noisy labels. 
Firstly, in the model testing stage, given a test dataset with noisy labels, and a classification model, TARS can use the test data to estimate how well the model will perform w.r.t. ground-truth labels. Secondly, in the model training stage, given a training dataset with noisy labels, and a classification algorithm, TARS can determine which label should be sent to an oracle to clean such that the model can be improved the most. For the first advice, we propose an effective estimation technique, and study how to compute confidence intervals to bound its estimation error. For the second advice, we propose a novel cleaning strategy along with two optimization techniques, and illustrate that it is superior to the existing cleaning strategies. We evaluate TARS on both simulated and real-world datasets. The results show that (1) TARS can use noisy test data to accurately estimate a model's true performance for various evaluation metrics; and (2) TARS can improve the model accuracy by a larger margin than the existing cleaning strategies, for the same cleaning budget.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lissandrini:2018:BMM, author = "Matteo Lissandrini and Martin Brugnara and Yannis Velegrakis", title = "Beyond macrobenchmarks: microbenchmark-based graph database evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "390--403", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite the increasing interest in graph databases their requirements and specifications are not yet fully understood by everyone, leading to a great deal of variation in the supported functionalities and the achieved performances. In this work, we provide a comprehensive study of the existing graph database systems. We introduce a novel microbenchmarking framework that provides insights on their performance that go beyond what macro-benchmarks can offer. The framework includes the largest set of queries and operators so far considered. The graph database systems are evaluated on synthetic and real data, from different domains, and at scales much larger than any previous work. The framework is materialized as an open-source suite and is easily extended to new datasets, systems, and queries.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Balegas:2018:IIP, author = "Valter Balegas and S{\'e}rgio Duarte and Carla Ferreira and Rodrigo Rodrigues and Nuno Pregui{\c{c}}a", title = "{IPA}: invariant-preserving applications for weakly consistent replicated databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "404--418", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "It is common to use weakly consistent replication to achieve high availability and low latency at a global scale. In this setting, concurrent updates may lead to states where application invariants do not hold. Some systems coordinate the execution of (conflicting) operations to avoid invariant violations, leading to high latency and reduced availability for those operations. This problem is worsened by the difficulty in identifying precisely which operations conflict. In this paper we propose a novel approach to preserve application invariants without coordinating the execution of operations. The approach consists of modifying operations in a way that application invariants are maintained in the presence of concurrent updates. When no conflicting updates occur, the modified operations present their original semantics. Otherwise, we use sensible and deterministic conflict resolution policies that preserve the invariants of the application. To implement this approach, we developed a static analysis, IPA, that identifies conflicting operations and proposes the necessary modifications to operations. Our analysis shows that IPA can avoid invariant violations in many applications, including typical database applications. 
Our evaluation reveals that the offline static analysis runs fast enough for being used with large applications. The overhead introduced in the modified operations is low and it leads to lower latency and higher throughput when compared with other approaches that enforce invariants.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abuzaid:2018:DRI, author = "Firas Abuzaid and Peter Kraft and Sahaana Suri and Edward Gan and Eric Xu and Atul Shenoy and Asvin Ananthanarayan and John Sheu and Erik Meijer and Xi Wu and Jeff Naughton and Peter Bailis and Matei Zaharia", title = "{DIFF}: a relational interface for large-scale data explanation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "419--432", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A range of explanation engines assist data analysts by performing feature selection over increasingly high-volume and high-dimensional data, grouping and highlighting commonalities among data points. While useful in diverse tasks such as user behavior analytics, operational event processing, and root cause analysis, today's explanation engines are designed as standalone data processing tools that do not interoperate with traditional, SQL-based analytics workflows; this limits the applicability and extensibility of these engines. In response, we propose the DIFF operator, a relational aggregation operator that unifies the core functionality of these engines with declarative relational query processing. 
We implement both single-node and distributed versions of the DIFF operator in MB SQL, an extension of MacroBase, and demonstrate how DIFF can provide the same semantics as existing explanation engines while capturing a broad set of production use cases in industry, including at Microsoft and Facebook. Additionally, we illustrate how this declarative approach to data explanation enables new logical and physical query optimizations. We evaluate these optimizations on several real-world production applications, and find that DIFF in MB SQL can outperform state-of-the-art engines by up to an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Basat:2018:SFI, author = "Ran {Ben Basat} and Roy Friedman and Rana Shahout", title = "Stream frequency over interval queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "433--445", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream frequency measurements are fundamental in many data stream applications such as financial data trackers, intrusion-detection systems, and network monitoring. Typically, recent data items are more relevant than old ones, a notion we can capture through a sliding window abstraction. This paper considers a generalized sliding window model that supports stream frequency queries over an interval given at query time. This enables drill-down queries, in which we can examine the behavior of the system in finer and finer granularities. For this model, we asymptotically improve the space bounds of existing work, reduce the update and query time to a constant, and provide deterministic solutions. 
When evaluated over real Internet packet traces, our fastest algorithm processes items 90--250 times faster, serves queries at least 730 times quicker and consumes at least 40\% less space than the best known method.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xin:2018:HHO, author = "Doris Xin and Stephen Macke and Litian Ma and Jialin Liu and Shuchen Song and Aditya Parameswaran", title = "{HELIX}: holistic optimization for accelerating iterative machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "4", pages = "446--460", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3297753.3297763", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning workflow development is a process of trial-and-error: developers iterate on workflows by testing out small modifications until the desired accuracy is achieved. Unfortunately, existing machine learning systems focus narrowly on model training---a small fraction of the overall development time---and neglect to address iterative development. We propose Helix, a machine learning system that optimizes the execution across iterations---intelligently caching and reusing, or recomputing intermediates as appropriate. Helix captures a wide variety of application needs within its Scala DSL, with succinct syntax defining unified processes for data preprocessing, model specification, and learning. We demonstrate that the reuse problem can be cast as a Max-Flow problem, while the caching problem is NP-Hard. We develop effective lightweight heuristics for the latter. 
Empirical evaluation shows that Helix is not only able to handle a wide variety of use cases in one unified workflow but also much faster, providing run time reductions of up to 19x over state-of-the-art systems, such as DeepDive or KeystoneML, on four real-world applications in natural language processing, computer vision, social and natural sciences.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fu:2019:FAN, author = "Cong Fu and Chao Xiang and Changxu Wang and Deng Cai", title = "Fast approximate nearest neighbor search with the navigating spreading-out graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "461--474", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303754", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate nearest neighbor search (ANNS) is a fundamental problem in databases and data mining. A scalable ANNS algorithm should be both memory-efficient and fast. Some early graph-based approaches have shown attractive theoretical guarantees on search time complexity, but they all suffer from the problem of high indexing time complexity. Recently, some graph-based methods have been proposed to reduce indexing complexity by approximating the traditional graphs; these methods have achieved revolutionary performance on million-scale datasets. Yet, they still can not scale to billion-node databases. In this paper, to further improve the search-efficiency and scalability of graph-based methods, we start by introducing four aspects: (1) ensuring the connectivity of the graph; (2) lowering the average out-degree of the graph for fast traversal; (3) shortening the search path; and (4) reducing the index size. 
Then, we propose a novel graph structure called Monotonic Relative Neighborhood Graph (MRNG) which guarantees very low search complexity (close to logarithmic time). To further lower the indexing complexity and make it practical for billion-node ANNS problems, we propose a novel graph structure named Navigating Spreading-out Graph (NSG) by approximating the MRNG. The NSG takes the four aspects into account simultaneously. Extensive experiments show that NSG outperforms all the existing algorithms significantly. In addition, NSG shows superior performance in the E-commercial scenario of Taobao (Alibaba Group) and has been integrated into their billion-scale search engine.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:DRF, author = "Qi Wang and Torsten Suel", title = "Document reordering for faster intersection", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "475--487", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303755", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A lot of research has studied how to optimize inverted index structures in search engines through suitable reassignment of document identifiers. This approach was originally proposed to allow for better compression of the index, but subsequent work showed that it can also result in significant speed-ups for conjunctive queries and even certain types of disjunctive top-k algorithms. However, we do not have a good understanding of why this happens, and how we could directly optimize an index for query processing speed. As a result, existing techniques attempt to optimize for size, and treat speed increases as a welcome side-effect. 
In this paper, we take an initial but important step towards understanding and modeling speed increases due to document reordering. We define the problem of minimizing the cost of queries given an inverted index and a query distribution, relate it to work on adaptive set intersection, and show that it is fundamentally different from that of minimizing compressed index size. We then propose a heuristic algorithm for finding a document reordering that minimizes query processing costs under suitable cost models. Our experiments show significant increases in the speed of intersections over state-of-the-art reordering techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2019:CCS, author = "Xiaofei Zhang and M. Tamer {\"O}zsu", title = "Correlation constraint shortest path over large multi-relation graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "488--501", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-relation graphs intuitively capture the heterogeneous correlations among real-world entities by allowing multiple types of relationships to be represented as entity-connecting edges, i.e., two entities could be correlated with more than one type of relationship. This is important in various applications such as social network analysis, ecology, and bio-informatics. Existing studies on these graphs usually consider an edge label constraint perspective, where each edge contains only one label and each edge is considered independently. 
For example, there are lines of research focusing on reachability between two vertices under a set of edge label constraints, or finding paths whose consecutive edge labels satisfy a user-specified logical expression. This is too restricted in real graphs, and in this work, we define a generic correlation constraint on multi-relation graphs from the perspective of vertex correlations, where a correlation can be defined recursively. Specifically, we formalize and investigate the shortest path problem over large multi-relation graphs in the presence of both necessity and denial constraints, which have various real applications. We show that it is nontrivial to apply conventional graph traversal algorithms (e.g., BFS or DFS) to address the challenge. To effectively reduce the search space, we propose a Hybrid Relation Encoding method, a.k.a. HyRE, to encode both topological and relation information in a compact way. We conduct extensive experiments over large real-world graphs to validate the effectiveness and efficiency of the proposed solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lang:2019:POF, author = "Harald Lang and Thomas Neumann and Alfons Kemper and Peter Boncz", title = "Performance-optimal filtering: {Bloom} overtakes {Cuckoo} at high throughput", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "502--515", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We define the concept of performance-optimal filtering to indicate the Bloom or Cuckoo filter configuration that best accelerates a particular task. 
While the space-precision tradeoff of these filters has been well studied, we show how to pick a filter that maximizes the performance for a given workload. This choice might be ``suboptimal'' relative to traditional space-precision metrics, but it will lead to better performance in practice. In this paper, we focus on high-throughput filter use cases, aimed at avoiding CPU work, e.g., a cache miss, a network message, or a local disk I/O --- events that can happen at rates of millions to hundreds per second. Besides the false-positive rate and memory footprint of the filter, performance optimality has to take into account the absolute cost of the filter lookup as well as the saved work per lookup that filtering avoids; while the actual rate of negative lookups in the workload determines whether using a filter improves overall performance at all. In the course of the paper, we introduce new filter variants, namely the register-blocked and cache-sectorized Bloom filters. We present new implementation techniques and perform an extensive evaluation on modern hardware platforms, including the wide-SIMD Skylake-X and Knights Landing. This experimentation shows that in high-throughput situations, the lower lookup cost of blocked Bloom filters allows them to overtake Cuckoo filters.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeuch:2019:AES, author = "Steffen Zeuch and Bonaventura {Del Monte} and Jeyhun Karimov and Clemens Lutz and Manuel Renz and Jonas Traub and Sebastian Bre{\ss} and Tilmann Rabl and Volker Markl", title = "Analyzing efficient stream processing on modern hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "516--530", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern Stream Processing Engines (SPEs) process large data volumes under tight latency constraints. Many SPEs execute processing pipelines using message passing on shared-nothing architectures and apply a partition-based scale-out strategy to handle high-velocity input streams. Furthermore, many state-of-the-art SPEs rely on a Java Virtual Machine to achieve platform independence and speed up system development by abstracting from the underlying hardware. In this paper, we show that taking the underlying hardware into account is essential to exploit modern hardware efficiently. To this end, we conduct an extensive experimental analysis of current SPEs and SPE design alternatives optimized for modern hardware. Our analysis highlights potential bottlenecks and reveals that state-of-the-art SPEs are not capable of fully exploiting current and emerging hardware trends, such as multi-core processors and high-speed networks. Based on our analysis, we describe a set of design changes to the common architecture of SPEs to scale-up on modern hardware. 
We show that the single-node throughput can be increased by up to two orders of magnitude compared to state-of-the-art SPEs by applying specialized code generation, fusing operators, batch-style parallelization strategies, and optimized windowing. This speedup allows for deploying typical streaming applications on a single or a few nodes instead of large clusters.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luo:2019:EDI, author = "Chen Luo and Michael J. Carey", title = "Efficient data ingestion and query processing for {LSM}-based storage systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "531--543", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, the Log Structured Merge (LSM) tree has been widely adopted by NoSQL and NewSQL systems for its superior write performance. Despite its popularity, however, most existing work has focused on LSM-based key--value stores with only a single LSM-tree; auxiliary structures, which are critical for supporting ad-hoc queries, have received much less attention. In this paper, we focus on efficient data ingestion and query processing for general-purpose LSM-based storage systems. We first propose and evaluate a series of optimizations for efficient batched point lookups, significantly improving the range of applicability of LSM-based secondary indexes. We then present several new and efficient maintenance strategies for LSM-based storage systems. 
Finally, we have implemented and experimentally evaluated the proposed techniques in the context of the Apache AsterixDB system, and we present the results here.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chrysogelos:2019:HEH, author = "Periklis Chrysogelos and Manos Karpathiotakis and Raja Appuswamy and Anastasia Ailamaki", title = "{HetExchange}: encapsulating heterogeneous {CPU--GPU} parallelism in {JIT} compiled engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "544--556", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern server hardware is increasingly heterogeneous as hardware accelerators, such as GPUs, are used together with multicore CPUs to meet the computational demands of modern data analytics workloads. Unfortunately, query parallelization techniques used by analytical database engines are designed for homogeneous multicore servers, where query plans are parallelized across CPUs to process data stored in cache coherent shared memory. Thus, these techniques are unable to fully exploit available heterogeneous hardware, where one needs to exploit task-parallelism of CPUs and data-parallelism of GPUs for processing data stored in a deep, non-cache-coherent memory hierarchy with widely varying access latencies and bandwidth. In this paper, we introduce HetExchange---a parallel query execution framework that encapsulates the heterogeneous parallelism of modern multi-CPU-multi-GPU servers and enables the parallelization of (pre-)existing sequential relational operators. 
In contrast to the interpreted nature of traditional Exchange, HetExchange is designed to be used in conjunction with JIT compiled engines in order to allow a tight integration with the proposed operators and generation of efficient code for heterogeneous hardware. We validate the applicability and efficiency of our design by building a prototype that can operate over both CPUs and GPUs, and enables its operators to be parallelism- and data-location-agnostic. In doing so, we show that efficiently exploiting CPU-GPU parallelism can provide 2.8x and 6.4x improvement in performance compared to state-of-the-art CPU-based and GPU-based DBMS.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Atzeni:2019:MMS, author = "Paolo Atzeni and Luigi Bellomarini and Paolo Papotti and Riccardo Torlone", title = "Meta-mappings for schema mapping reuse", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "557--569", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The definition of mappings between heterogeneous schemas is a critical activity of any database application. Existing tools provide high level interfaces for the discovery of correspondences between elements of schemas, but schema mappings need to be manually specified every time from scratch, even if the scenario at hand is similar to one that has already been addressed. The problem is that schema mappings are precisely defined over a pair of schemas and cannot directly be reused on different scenarios. We tackle this challenge by generalizing schema mappings as meta-mappings: formalisms that describe transformations between generic data structures called meta-schemas. 
We formally characterize schema mapping reuse and explain how meta-mappings are able to: (i) capture enterprise knowledge from previously defined schema mappings and (ii) use this knowledge to suggest new mappings. We develop techniques to infer meta-mappings from existing mappings, to organize them into a searchable repository, and to leverage the repository to propose to users mappings suitable for their needs. We study effectiveness and efficiency in an extensive evaluation over real-world scenarios and show that our system can infer, store, and search millions of meta-mappings in seconds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2019:EEG, author = "Lijie Xu and Tian Guo and Wensheng Dou and Wei Wang and Jun Wei", title = "An experimental evaluation of garbage collectors on big data applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "570--583", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Popular big data frameworks, ranging from Hadoop MapReduce to Spark, rely on garbage-collected languages, such as Java and Scala. Big data applications are especially sensitive to the effectiveness of garbage collection (i.e., GC), because they usually process a large volume of data objects that lead to heavy GC overhead. Lacking in-depth understanding of GC performance has impeded performance improvement in big data applications. In this paper, we conduct the first comprehensive evaluation on three popular garbage collectors, i.e., Parallel, CMS, and G1, using four representative Spark applications. 
By thoroughly investigating the correlation between these big data applications' memory usage patterns and the collectors' GC patterns, we obtain many findings about GC inefficiencies. We further propose empirical guidelines for application developers, and insightful optimization strategies for designing big-data-friendly garbage collectors.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Guo:2019:AOC, author = "Jinwei Guo and Peng Cai and Jiahao Wang and Weining Qian and Aoying Zhou", title = "Adaptive optimistic concurrency control for heterogeneous workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "584--596", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303763", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Optimistic concurrency control (OCC) protocols validate whether a transaction has conflicts with other concurrent transactions after this transaction completes its execution. In this work, we demonstrate that the validation phase has a great influence on the performance of modern in-memory database systems, especially under heterogeneous workloads. The cost of validating operations in a transaction is determined by two main factors. The first factor is the operation type. An OCC protocol would take much less cost on validating a single-record read operation than validating a key-range scan operation. The second factor is the workload type. Existing schemes in OCC variants for validating key-range scan perform differently under various workloads. Although various validation schemes share the same goal of guaranteeing a transaction schedule to be serializable, there are remarkable differences between the costs they introduced. 
These observations motivate us to design an optimistic concurrency control which can choose a low-cost validation scheme at runtime, referred to as adaptive optimistic concurrency control (AOCC). First, at transaction-level granularity, AOCC can assign a validation method to a transaction according to the features of its operations. Furthermore, for each operation in a transaction, the validation method is selected according to not only the number of accessed records but also the instant characteristics of workloads. Experimental results show that AOCC has good performance and scalability under heterogeneous workloads mixed with point accesses and predicate queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lin:2019:MTC, author = "Yu-Shan Lin and Shao-Kan Pi and Meng-Kai Liao and Ching Tsai and Aaron Elmore and Shan-Hung Wu", title = "{MgCrab}: transaction crabbing for live migration in deterministic database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "597--610", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303764", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent deterministic database systems have achieved high scalability and high availability in distributed environments given OLTP workloads. However, modern OLTP applications usually have changing workloads or access patterns, so how to make the resource provisioning elastic to the changing workloads becomes an important design goal for a deterministic database system. Live migration, which moves the specified data from a source machine to a destination node while continuously serving the incoming transactions, is a key technique required for the elasticity. 
In this paper, we present MgCrab, a live migration technique for a deterministic database system, that leverages the determinism to maintain the consistency of data on the source and destination nodes at very low cost during a migration period. We implement MgCrab on an open-source database system. Extensive experiments were conducted and the results demonstrate the effectiveness of MgCrab.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Maiyya:2019:UCA, author = "Sujaya Maiyya and Faisal Nawab and Divyakant Agrawal and Amr {El Abbadi}", title = "Unifying consensus and atomic commitment for effective cloud data management", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "5", pages = "611--623", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3303753.3303765", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Feb 27 14:03:31 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See errata \cite{Maiyya:2021:EUC}.", abstract = "Data storage in the Cloud needs to be scalable and fault-tolerant. Atomic commitment protocols such as Two Phase Commit (2PC) provide ACID guarantees for transactional access to sharded data and help in achieving scalability. Whereas consensus protocols such as Paxos consistently replicate data across different servers and provide fault tolerance. Cloud based datacenters today typically treat the problems of scalability and fault-tolerance disjointedly. In this work, we propose a unification of these two different paradigms into one framework called Consensus and Commitment (C\&C) framework. The C\&C framework can model existing and well known data management protocols as well as propose new ones. 
We demonstrate the advantages of the C\&C framework by developing a new atomic commitment protocol, Paxos Atomic Commit (PAC), which integrates commitment with recovery in a Paxos-like manner. We also instantiate commit protocols from the C\&C framework catered to different Cloud data management techniques. In particular, we propose a novel protocol, Generalized PAC (G-PAC) that integrates atomic commitment and fault tolerance in a cloud paradigm involving both sharding and replication of data. We compare the performance of G-PAC with a Spanner-like protocol, where 2PC is used at the logical data level and Paxos is used for consistent replication of logical data. The experimental results highlight the benefits of combining consensus along with commitment into a single integrated protocol.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2019:ATC, author = "Chenggang Wu and Vikram Sreekanti and Joseph M. Hellerstein", title = "Autoscaling tiered cloud storage in {Anna}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "624--638", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311881", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we describe how we extended a distributed key--value store called Anna into an autoscaling, multi-tier service for the cloud. In its extended form, Anna is designed to overcome the narrow cost-performance limitations typical of current cloud storage systems. 
We describe three key aspects of Anna's new design: multi-master selective replication of hot keys, a vertical tiering of storage layers with different cost-performance tradeoffs, and horizontal elasticity of each tier to add and remove nodes in response to load dynamics. Anna's policy engine uses these mechanisms to balance service-level objectives around cost, latency and fault tolerance. Experimental results explore the behavior of Anna's mechanisms and policy, exhibiting orders of magnitude efficiency improvements over both commodity cloud KVS services and research systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dignos:2019:SST, author = "Anton Dign{\"o}s and Boris Glavic and Xing Niu and Michael B{\"o}hlen and Johann Gamper", title = "Snapshot semantics for temporal multiset relations", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "639--652", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311882", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Snapshot semantics is widely used for evaluating queries over temporal data: temporal relations are seen as sequences of snapshot relations, and queries are evaluated at each snapshot. In this work, we demonstrate that current approaches for snapshot semantics over interval-timestamped multiset relations are subject to two bugs regarding snapshot aggregation and bag difference. We introduce a novel temporal data model based on K -relations that overcomes these bugs and prove it to correctly encode snapshot semantics. 
Furthermore, we present an efficient implementation of our model as a database middleware and demonstrate experimentally that our approach is competitive with native implementations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kwashie:2019:CEE, author = "Selasi Kwashie and Lin Liu and Jixue Liu and Markus Stumptner and Jiuyong Li and Lujing Yang", title = "{Certus}: an effective entity resolution approach with graph differential dependencies {(GDDs)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "653--666", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311883", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity resolution (ER) is the problem of accurately identifying multiple, differing, and possibly contradicting representations of unique real-world entities in data. It is a challenging and fundamental task in data cleansing and data integration. In this work, we propose graph differential dependencies (GDDs) as an extension of the recently developed graph entity dependencies (which are formal constraints for graph data) to enable approximate matching of values. Furthermore, we investigate a special discovery of GDDs for ER by designing an algorithm for generating a non-redundant set of GDDs in labelled data. Then, we develop an effective ER technique, Certus, that employs the learned GDDs for improving the accuracy of ER results. We perform extensive empirical evaluation of our proposals on five real-world ER benchmark datasets and a proprietary database to test their effectiveness and efficiency. 
The results from the experiments show the discovery algorithm and Certus are efficient; and more importantly, GDDs significantly improve the precision of ER without considerable trade-off of recall.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Han:2019:EEA, author = "Kai Han and Fei Gui and Xiaokui Xiao and Jing Tang and Yuntian He and Zongmai Cao and He Huang", title = "Efficient and effective algorithms for clustering uncertain graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "667--680", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311884", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the edge uncertainty in an undirected graph and study the $k$-median (resp. $k$-center) problems, where the goal is to partition the graph nodes into $k$ clusters such that the average (resp. minimum) connection probability between each node and its cluster's center is maximized. We analyze the hardness of these problems, and propose algorithms that provide considerably improved approximation guarantees than the existing studies do. Specifically, our algorithms offer $ (1 - 1 / e) $-approximations for the $k$-median problem and $ (\mathrm{OPT}_k^c) $-approximations for the $k$-center problem, where $ \mathrm{OPT}_k^c $ is the optimal objective function value for $k$-center. In addition, our algorithms incorporate several non-trivial optimizations that significantly enhance their practical efficiency. Extensive experimental results demonstrate that our algorithms considerably outperform the existing methods on both computation efficiency and the quality of clustering results.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zou:2019:PMD, author = "Jia Zou and Arun Iyengar and Chris Jermaine", title = "{Pangea}: monolithic distributed storage for data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "681--694", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311885", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Storage and memory systems for modern data analytics are heavily layered, managing shared persistent data, cached data, and nonshared execution data in separate systems such as a distributed file system like HDFS, an in-memory file system like Alluxio, and a computation framework like Spark. Such layering introduces significant performance and management costs. In this paper we propose a single system called Pangea that can manage all data---both intermediate and long-lived data, and their buffer/caching, data placement optimization, and failure recovery---all in one monolithic distributed storage system, without any layering. We present a detailed performance evaluation of Pangea and show that its performance compares favorably with several widely used layered systems such as Spark.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2019:SMD, author = "Zhiwei Fan and Jianqiao Zhu and Zuyu Zhang and Aws Albarghouthi and Paraschos Koutris and Jignesh M. 
Patel", title = "Scaling-up in-memory datalog processing: observations and techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "695--708", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311886", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recursive query processing has experienced a recent resurgence, as a result of its use in many modern application domains, including data integration, graph analytics, security, program analysis, networking and decision making. Due to the large volumes of data being processed, several research efforts across multiple communities have explored how to scale up recursive queries, typically expressed in Datalog. Our experience with these tools indicate that their performance does not translate across domains---e.g., a tool designed for large-scale graph analytics does not exhibit the same performance on program-analysis tasks, and vice versa. Starting from the above observation, we make the following two contributions. First, we perform a detailed experimental evaluation comparing a number of state-of-the-art Datalog systems on a wide spectrum of graph analytics and program-analysis tasks, and summarize the pros and cons of existing techniques. Second, we design and implement our own general-purpose Datalog engine, called RecStep, on top of a parallel single-node relational system. We outline the techniques we applied on RecStep, as well as the contribution of each technique to the overall performance. Using RecStep as a baseline, we demonstrate that it generally out-performs state-of-the-art parallel Datalog engines on complex and large-scale Datalog evaluation, by a 4-6X margin. 
An additional insight from our work is that it is possible to build a high-performance Datalog system on top of a relational engine, an idea that has been dismissed in past work.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Archer:2019:CAL, author = "Aaron Archer and Kevin Aydin and Mohammad Hossein Bateni and Vahab Mirrokni and Aaron Schild and Ray Yang and Richard Zhuang", title = "Cache-aware load balancing of data center applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "6", pages = "709--723", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3311880.3311887", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Our deployment of cache-aware load balancing in the Google web search backend reduced cache misses by $ \approx $0.5x, contributing to a double-digit percentage increase in the throughput of our serving clusters by relieving a bottleneck. This innovation has benefited all production workloads since 2015, serving billions of queries daily. A load balancer forwards each query to one of several identical serving replicas. The replica pulls each term's postings list into RAM from flash, either locally or over the network. Flash bandwidth is a critical bottleneck, motivating an application-directed RAM cache on each replica. Sending the same term reliably to the same replica would increase the chance it hits cache, and avoid polluting the other replicas' caches. However, most queries contain multiple terms and we have to send the whole query to one replica, so it is not possible to achieve a perfect partitioning of terms to replicas. 
We solve this via a voting scheme, whereby the load balancer conducts a weighted vote by the terms in each query, and sends the query to the winning replica. We develop a multi-stage scalable algorithm to learn these weights. We first construct a large-scale term-query graph from logs and apply a distributed balanced graph partitioning algorithm to cluster each term to a preferred replica. This yields a good but simplistic initial voting table, which we then iteratively refine via cache simulation to capture feedback effects.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Borkowski:2019:MCR, author = "Michael Borkowski and Christoph Hochreiner and Stefan Schulte", title = "Minimizing cost by reducing scaling operations in distributed stream processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "724--737", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317316", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Elastic distributed stream processing systems are able to dynamically adapt to changes in the workload. Often, these systems react to the rate of incoming data, or to the level of resource utilization, by scaling up or down. The goal is to optimize the system's resource usage, thereby reducing its operational cost. However, such scaling operations consume resources on their own, introducing a certain overhead of resource usage, and therefore cost, for every scaling operation. In addition, migrations caused by scaling operations inevitably lead to brief processing gaps. Therefore, an excessive number of scaling operations should be avoided. 
We approach this problem by preventing unnecessary scaling operations and over-compensating reactions to short-term changes in the workload. This allows to maintain elasticity, while also minimizing the incurred overhead cost of scaling operations. To achieve this, we use advanced filtering techniques from the field of signal processing to pre-process raw system measurements, thus mitigating superfluous scaling operations. We perform a real-world testbed evaluation verifying the effects, and provide a break-even cost analysis to show the economic feasibility of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2019:PPB, author = "Yinjun Wu and Abdussalam Alawini and Daniel Deutch and Tova Milo and Susan Davidson", title = "{ProvCite}: provenance-based data citation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "738--751", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317317", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As research products expand to include structured datasets, the challenge arises of how to automatically generate citations to the results of arbitrary queries against such datasets. Previous work explored this problem in the context of conjunctive queries and views using a Rewriting-Based Model (RBM). However, an increasing number of scientific queries are aggregate, e.g. statistical summaries of the underlying data, for which the RBM cannot be easily extended. 
In this paper, we show how a Provenance-Based Model (PBM) can be leveraged to (1) generate citations to conjunctive as well as aggregate queries and views; (2) associate citations with individual result tuples to enable arbitrary subsets of the result set to be cited (fine-grained citations); and (3) be optimized to return citations in acceptable time. Our implementation of PBM in ProvCite shows that it not only handles a larger class of queries and views than RBM, but can outperform it when restricted to conjunctive views in some cases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2019:DCF, author = "Wenfei Fan and Ping Lu and Chao Tian and Jingren Zhou", title = "Deducing certain fixes to graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "752--765", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317318", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes to deduce certain fixes to graphs G based on data quality rules $ \Sigma $ and ground truth $ \Gamma $ (i.e., validated attribute values and entity matches). We fix errors detected by $ \Sigma $ in G such that the fixes are assured correct as long as $ \Sigma $ and $ \Gamma $ are correct. We deduce certain fixes in two paradigms. (a) We interact with users and ``incrementally'' fix errors online. Whenever users pick a small set V$_0$ of nodes in G, we fix all errors pertaining to V$_0$ and accumulate ground truth in the process. (b) Based on accumulated $ \Gamma $, we repair the entire graph G offline; while this may not correct all errors in G, all fixes are guaranteed certain. We develop techniques for deducing certain fixes. 
(1) We define data quality rules to support conditional functional dependencies, recursively defined keys and negative rules on graphs, such that we can deduce fixes by combining data repairing and object identification. (2) We show that deducing certain fixes is Church--Rosser, i.e., the deduction converges at the same fixes regardless of the order of rules applied. (3) We establish the complexity of three fundamental problems associated with certain fixes. (4) We provide (parallel) algorithms for deducing certain fixes online and offline, and guarantee to reduce running time when given more processors. Using real-life and synthetic data, we experimentally verify the effectiveness and scalability of our methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ceccarello:2019:SCC, author = "Matteo Ceccarello and Andrea Pietracaprina and Geppino Pucci", title = "Solving $k$-center clustering (with outliers) in {MapReduce} and streaming, almost as accurately as sequentially", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "766--778", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317319", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Center-based clustering is a fundamental primitive for data analysis and becomes very challenging for large datasets. In this paper, we focus on the popular k center variant which, given a set S of points from some metric space and a parameter k < | S |, requires to identify a subset of k centers in S minimizing the maximum distance of any point of S from its closest center. 
A more general formulation, introduced to deal with noisy datasets, features a further parameter z and allows up to z points of S (outliers) to be disregarded when computing the maximum distance from the centers. We present coreset-based 2-round MapReduce algorithms for the above two formulations of the problem, and a 1-pass Streaming algorithm for the case with outliers. For any fixed $ \epsilon > 0 $, the algorithms yield solutions whose approximation ratios are a mere additive term $ \epsilon $ away from those achievable by the best known polynomial-time sequential algorithms, a result that substantially improves upon the state of the art. Our algorithms are rather simple and adapt to the intrinsic complexity of the dataset, captured by the doubling dimension D of the metric space. Specifically, our analysis shows that the algorithms become very space-efficient for the important case of small (constant) D. These theoretical results are complemented with a set of experiments on real-world and synthetic datasets of up to over a billion points, which show that our algorithms yield better quality solutions over the state of the art while featuring excellent scalability, and that they also lend themselves to sequential implementations much faster than existing ones.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:EED, author = "Xiaolan Wang and Alexandra Meliou", title = "{Explain$3$D}: explaining disagreements in disjoint datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "779--792", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317320", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data plays an important role in applications, analytic processes, and many aspects of human activity. As data grows in size and complexity, we are met with an imperative need for tools that promote understanding and explanations over data-related operations. Data management research on explanations has focused on the assumption that data resides in a single dataset, under one common schema. But the reality of today's data is that it is frequently unintegrated, coming from different sources with different schemas. When different datasets provide different answers to semantically similar questions, understanding the reasons for the discrepancies is challenging and cannot be handled by the existing single-dataset solutions. In this paper, we propose explain3D, a framework for explaining the disagreements across disjoint datasets (3D). Explain3D focuses on identifying the reasons for the differences in the results of two semantically similar queries operating on two datasets with potentially different schemas. Our framework leverages the queries to perform a semantic mapping across the relevant parts of their provenance; discrepancies in this mapping point to causes of the queries' differences. Exploiting the queries gives explain3D an edge over traditional schema matching and record linkage techniques, which are query-agnostic. 
Our work makes the following contributions: (1) We formalize the problem of deriving optimal explanations for the differences of the results of semantically similar queries over disjoint datasets. Our optimization problem considers two types of explanations, provenance-based and value-based, defined over an evidence mapping, which makes our solution interpretable. (2) We design a 3-stage framework for solving the optimal explanation problem. (3) We develop a smart-partitioning optimizer that improves the efficiency of the framework by orders of magnitude. (4) We experiment with real-world and synthetic data to demonstrate that explain3D can derive precise explanations efficiently, and is superior to alternative methods based on integration techniques and single-dataset explanation frameworks.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Won:2019:DDS, author = "Youjip Won and Sundoo Kim and Juseong Yun and Dam Quang Tuan and Jiwon Seo", title = "{DASH}: database shadowing for mobile {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "793--806", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317321", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we propose Database Shadowing, or DASH, which is a new crash recovery technique for SQLite DBMS. DASH is a hybrid mixture of classical shadow paging and logging. DASH addresses four major issues in the current SQLite journal modes: the performance and write amplification issues of the rollback mode and the storage space requirement and tail latency issues of the WAL mode. DASH exploits two unique characteristics of SQLite: the database files are small and the transactions are entirely serialized. 
DASH consists of three key ingredients Aggregate Update, Atomic Exchange and Version Reset. Aggregate Update eliminates the redundant write overhead and the requirement to maintain multiple snapshots both of which are inherent in the out-of-place update. Atomic Exchange resolves the overhead of updating the locations of individual database pages exploiting order-preserving nature of the metadata update operation in modern filesystem. Version Reset makes the result of the Atomic Exchange durable without relying on expensive filesystem journaling. The salient aspect of DASH lies in its simplicity and compatibility with the legacy. DASH does not require any modifications in the underlying filesystem or the database organization. It requires only 451 LOC to implement. In Cyclomatic Complexity score, which represents software complexity, DASH renders 33\% lower (simpler) mark than PERSIST and WAL modes of SQLite. We implement DASH for SQLite on Android and extensively evaluate it on widely used smartphone devices. DASH yields 4x performance gain over PERSIST mode (default journaling mode). Compared to WAL mode (the fastest journaling mode), DASH uses only 2.5\% of the storage space on average. The transaction latency of DASH at 99.9\% is one fourth of that of WAL mode.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:AGL, author = "Zeke Wang and Kaan Kara and Hantian Zhang and Gustavo Alonso and Onur Mutlu and Ce Zhang", title = "Accelerating generalized linear models with {MLWeaving}: a one-size-fits-all system for any-precision learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "807--821", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317322", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Learning from the data stored in a database is an important function increasingly available in relational engines. Methods using lower precision input data are of special interest given their overall higher efficiency. However, in databases, these methods have a hidden cost: the quantization of the real value into a smaller number is an expensive step. To address this issue, we present MLWeaving, a data structure and hardware acceleration technique intended to speed up learning of generalized linear models over low precision data. MLWeaving provides a compact in-memory representation that enables the retrieval of data at any level of precision. MLWeaving also provides a highly efficient implementation of stochastic gradient descent on FPGAs and enables the dynamic tuning of precision, instead of using a fixed precision level during learning. Experimental results show that MLWeaving converges up to $ 16 \times $ faster than low-precision implementations of first-order methods on CPUs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jankov:2019:DRC, author = "Dimitrije Jankov and Shangyu Luo and Binhang Yuan and Zhuhua Cai and Jia Zou and Chris Jermaine and Zekai J. Gao", title = "Declarative recursive computation on an {RDBMS}: or, why you should use a database for distributed machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "7", pages = "822--835", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3317315.3317323", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 20 17:32:19 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A number of popular systems, most notably Google's TensorFlow, have been implemented from the ground up to support machine learning tasks. We consider how to make a very small set of changes to a modern relational database management system (RDBMS) to make it suitable for distributed learning computations. Changes include adding better support for recursion, and optimization and execution of very large compute plans. We also show that there are key advantages to using an RDBMS as a machine learning platform. In particular, learning based on a database management system allows for trivial scaling to large data sets and especially large models, where different computational units operate on different parts of a model that may be too large to fit into RAM.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ghandeharizadeh:2019:DIE, author = "Shahram Ghandeharizadeh and Hieu Nguyen", title = "Design, implementation, and evaluation of write-back policy with cache augmented data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "8", pages = "836--849", month = apr, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3324301.3324302", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Cache Augmented Data Store (CADS) architecture extends a persistent data store with an in-memory cache manager. It is widely deployed to support read-intensive workloads. However, its write-around and write-through policies prevent the caching tier from absorbing write load. This means the data store layer must scale to process writes even when the extra capacity is not needed for read load. We address this limitation by devising a write-back technique to enable the caching layer to process both reads and writes. This technique preserves ACID transactions. We present a client side implementation of write-back and evaluate it using the YCSB, BG, and TPC-C benchmarks. In addition, we compare our write-back with (a) write-back policy of a data store such as MongoDB and (b) write-back policy of a host-side cache such as Flashcache.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nguyen:2019:UGE, author = "Thanh Tam Nguyen and Matthias Weidlich and Hongzhi Yin and Bolong Zheng and Quoc Viet Hung Nguyen and Bela Stantic", title = "User guidance for efficient fact checking", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "8", pages = "850--863", month = apr, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3324301.3324303", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Web constitutes a valuable source of information. In recent years, it fostered the construction of large-scale knowledge bases, such as Freebase, YAGO, and DBpedia. The open nature of the Web, with content potentially being generated by everyone, however, leads to inaccuracies and misinformation. Construction and maintenance of a knowledge base thus has to rely on fact checking, an assessment of the credibility of facts. Due to an inherent lack of ground truth information, such fact checking cannot be done in a purely automated manner, but requires human involvement. In this paper, we propose a comprehensive framework to guide users in the validation of facts, striving for a minimisation of the invested effort. Our framework is grounded in a novel probabilistic model that combines user input with automated credibility inference. Based thereon, we show how to guide users in fact checking by identifying the facts for which validation is most beneficial. Moreover, our framework includes techniques to reduce the manual effort invested in fact checking by determining when to stop the validation and by supporting efficient batching strategies. We further show how to handle fact checking in a streaming setting. 
Our experiments with three real-world datasets demonstrate the efficiency and effectiveness of our framework: A knowledge base of high quality, with a precision of above 90\%, is constructed with only a half of the validation effort required by baseline techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ke:2019:DCR, author = "Xiangyu Ke and Arijit Khan and Leroy Lim Hong Quan", title = "An in-depth comparison of $s$--$t$ reliability algorithms over uncertain graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "8", pages = "864--876", month = apr, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3324301.3324304", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:01 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Uncertain, or probabilistic, graphs have been increasingly used to represent noisy linked data in many emerging applications, and have recently attracted the attention of the database research community. A fundamental problem on uncertain graphs is the s-t reliability, which measures the probability that a target node t is reachable from a source node s in a probabilistic (or uncertain) graph, i.e., a graph where every edge is assigned a probability of existence. Due to the inherent complexity of the s-t reliability estimation problem (\#P-hard), various sampling and indexing based efficient algorithms were proposed in the literature. However, since they have not been thoroughly compared with each other, it is not clear whether the later algorithm outperforms the earlier ones. More importantly, the comparison framework, datasets, and metrics were often not consistent (e.g., different convergence criteria were employed to find the optimal number of samples) across these works. 
We address this serious concern by re-implementing six
                 state-of-the-art s-t reliability estimation methods in
                 a common system and code base, using several medium and
                 large-scale, real-world graph datasets, identical
                 evaluation metrics, and query workloads. Through our
                 systematic and in-depth analysis of experimental
                 results, we report surprising findings, such as many
                 follow-up algorithms can actually be several orders of
                 magnitude inefficient, less accurate, and more memory
                 intensive compared to the ones that were proposed
                 earlier. We conclude by discussing our recommendations
                 on the road ahead.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Fan:2019:DSP,
  author =       "Wenfei Fan and Chunming Hu and Muyang Liu and Ping Lu
                 and Qiang Yin and Jingren Zhou",
  title =        "Dynamic scaling for parallel graph computations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "8",
  pages =        "877--890",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3324301.3324305",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper studies scaling out/in to cope with load
                 surges. Given a graph G that is vertex-partitioned and
                 distributed across n processors, it is to add (resp.
                 remove) k processors and re-distribute G across
                 $ n + k $ (resp. $ n - k $) processors such that the
                 load among the processors is balanced, and its
                 replication factor and migration cost are minimized. We
                 show that this tri-criteria optimization problem is
                 intractable, even when k is a constant and when either
                 load balancing or minimum migration is not required.
                 Nonetheless, we propose two parallel solutions to
                 dynamic scaling.
One consists of approximation algorithms by extending consistent
                 hashing. Given a load balancing factor above a lower
                 bound, the algorithms guarantee provable bounds on both
                 replication factor and migration cost. The other is a
                 generic scaling scheme. Given any existing
                 vertex-partitioner VP of users' choice, it adaptively
                 scales VP in and out such that it incurs minimum
                 migration cost, and ensures balance and replication
                 factors within a bound relative to that of VP. Using
                 real-life and synthetic graphs, we experimentally
                 verify the efficiency, effectiveness and scalability of
                 the solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Li:2019:TTR,
  author =       "Dongsheng Li and Yiming Zhang and Jinyan Wang and
                 Kian-Lee Tan",
  title =        "{TopoX}: topology refactorization for efficient graph
                 partitioning and processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "8",
  pages =        "891--905",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3324301.3324306",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Traditional graph partitioning methods attempt to both
                 minimize communication cost and guarantee load
                 balancing in computation. However, the skewed degree
                 distribution of natural graphs makes it difficult to
                 simultaneously achieve the two objectives. This paper
                 proposes topology refactorization (TR), a
                 topology-aware method allowing graph-parallel systems
                 to separately handle the two objectives:
                 refactorization is mainly focused on reducing
                 communication cost, and partitioning is mainly targeted
                 for balancing the load.
TR transforms a skewed graph into a more communication-efficient
                 topology through fusion and fission, where the fusion
                 operation organizes a set of neighboring low-degree
                 vertices into a super-vertex, and the fission operation
                 splits a high-degree vertex into a set of sibling
                 sub-vertices. Based on TR, we design an efficient
                 graph-parallel system (TopoX) which pipelines
                 refactorization with partitioning to both reduce
                 communication cost and balance computation load.
                 Prototype evaluation shows that TopoX outperforms
                 state-of-the-art PowerLyra by up to 78.5\% (from
                 37.2\%) on real-world graphs and is significantly
                 faster than other graph-parallel systems, while only
                 introducing small refactorization overhead and memory
                 consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Avdiukhin:2019:MDB,
  author =       "Dmitrii Avdiukhin and Sergey Pupyrev and Grigory
                 Yaroslavtsev",
  title =        "Multi-dimensional balanced graph partitioning via
                 projected gradient descent",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "8",
  pages =        "906--919",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3324301.3324307",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Motivated by performance optimization of large-scale
                 graph processing systems that distribute the graph
                 across multiple machines, we consider the balanced
                 graph partitioning problem. Compared to most of the
                 previous work, we study the multi-dimensional variant
                 in which balance according to multiple weight functions
                 is required. As we demonstrate by experimental
                 evaluation, such multi-dimensional balance is essential
                 for achieving performance improvements for typical
                 distributed graph processing workloads.
We propose a new scalable technique for the multidimensional balanced
                 graph partitioning problem. It is based on applying
                 randomized projected gradient descent to a non-convex
                 continuous relaxation of the objective. We show how to
                 implement the new algorithm efficiently in both theory
                 and practice utilizing various approaches for the
                 projection step. Experiments with large-scale graphs
                 containing up to hundreds of billions of edges indicate
                 that our algorithm has superior performance compared to
                 the state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Cao:2019:EDS,
  author =       "Lei Cao and Yizhou Yan and Samuel Madden and Elke A.
                 Rundensteiner and Mathan Gopalsamy",
  title =        "Efficient discovery of sequence outlier patterns",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "8",
  pages =        "920--932",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3324301.3324308",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Modern Internet of Things (IoT) applications generate
                 massive amounts of time-stamped data, much of it in the
                 form of discrete, symbolic sequences. In this work, we
                 present a new system called TOP that deTects Outlier
                 Patterns from these sequences. To solve the fundamental
                 limitation of existing pattern mining semantics that
                 miss outlier patterns hidden inside of larger frequent
                 patterns, TOP offers new pattern semantics based on
                 contextual patterns that distinguish the independent
                 occurrence of a pattern from its occurrence as part of
                 its super-pattern. We present efficient algorithms for
                 the mining of this new class of contextual patterns.
In particular, in contrast to the bottom-up strategy for
                 state-of-the-art pattern mining techniques, our
                 top-down Reduce strategy piggy backs pattern detection
                 with the detection of the context in which a pattern
                 occurs. Our approach achieves linear time complexity in
                 the length of the input sequence. Effective
                 optimization techniques such as context-driven search
                 space pruning and inverted index-based outlier pattern
                 detection are also proposed to further speed up
                 contextual pattern mining. Our experimental evaluation
                 demonstrates the effectiveness of TOP at capturing
                 meaningful outlier patterns in several real-world IoT
                 use cases. We also demonstrate the efficiency of TOP,
                 showing it to be up to 2 orders of magnitude faster
                 than adapting state-of-the-art mining to produce this
                 new class of contextual outlier patterns, allowing us
                 to scale outlier pattern mining to large sequence
                 datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Bogatov:2019:CEO,
  author =       "Dmytro Bogatov and George Kollios and Leonid Reyzin",
  title =        "A comparative evaluation of order-revealing encryption
                 schemes and secure range-query protocols",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "8",
  pages =        "933--947",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3324301.3324309",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Database query evaluation over encrypted data can allow
                 database users to maintain the privacy of their data
                 while outsourcing data processing. Order-Preserving
                 Encryption (OPE) and Order-Revealing Encryption (ORE)
                 were designed to enable efficient query execution, but
                 provide only partial privacy.
More private protocols, based on Searchable Symmetric Encryption (SSE),
                 Oblivious RAM (ORAM) or custom encrypted data
                 structures, have also been designed. In this paper, we
                 develop a framework to provide the first comprehensive
                 comparison among a number of range query protocols that
                 ensure varying levels of privacy of user data. We
                 evaluate five ORE-based and five generic range query
                 protocols. We analyze and compare them both
                 theoretically and experimentally and measure their
                 performance over database indexing and query
                 evaluation. We report not only execution time but also
                 I/O performance, communication amount, and usage of
                 cryptographic primitive operations. Our comparison
                 reveals some interesting insights concerning the
                 relative security and performance of these approaches
                 in database settings.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Orakzai:2019:HFM,
  author =       "Faisal Orakzai and Toon Calders and Torben Bach
                 Pedersen",
  title =        "$ k / 2$-hop: fast mining of convoy patterns with
                 effective pruning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "948--960",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329773",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "With the increase of devices equipped with location
                 sensors, mining spatio-temporal data for interesting
                 behavioral patterns has gained attention in recent
                 years. One of such well-known patterns is the convoy
                 pattern which can be used, e.g., to find groups of
                 people moving together in public transport or to
                 prevent traffic jams. A convoy consists of at least m
                 objects moving together for at least k consecutive time
                 instants where m and k are user-defined parameters.
Convoy mining is an expensive task and existing sequential algorithms
                 do not scale to real-life dataset sizes. Existing
                 sequential as well as parallel algorithms require a
                 complex set of data-dependent parameters which are hard
                 to set and tune. Therefore, in this paper, we propose a
                 new fast exact sequential convoy pattern mining
                 algorithm ``k/2-hop'' that is free of data-dependent
                 parameters. The proposed algorithm processes the data
                 corresponding to a few specific key timestamps at each
                 step and quickly prunes objects with no possibility of
                 forming a convoy. Thus, only a very small portion of
                 the complete dataset is considered for mining convoys.
                 Our experimental results show that k/2-hop outperforms
                 existing sequential as well as parallel convoy pattern
                 mining algorithms by orders of magnitude, and scales to
                 larger datasets which existing algorithms fail on.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sun:2019:BAD,
  author =       "Ji Sun and Zeyuan Shang and Guoliang Li and Dong Deng
                 and Zhifeng Bao",
  title =        "Balance-aware distributed string similarity-based
                 query processing system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "961--974",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329774",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data analysts spend more than 80\% of time on data
                 cleaning and integration in the whole process of data
                 analytics due to data errors and inconsistencies.
                 Similarity-based query processing is an important way
                 to tolerate the errors and inconsistencies. However,
                 similarity-based query processing is rather costly and
                 traditional database cannot afford such expensive
                 requirement.
                 In this paper, we develop a distributed in-memory
                 similarity-based query processing system called Dima.
                 Dima supports four core similarity operations, i.e.,
                 similarity selection, similarity join, top-$k$
                 selection and top-$k$ join. Dima extends SQL for users
                 to easily invoke these similarity-based operations in
                 their data analysis tasks. To avoid expensive data
                 transmission in a distributed environment, we propose
                 balance-aware signatures where two records are similar
                 if they share common signatures, and we can adaptively
                 select the signatures to balance the workload. Dima
                 builds signature-based global indexes and local indexes
                 to support similarity operations. Since Spark is one of
                 the widely adopted distributed in-memory computing
                 systems, we have seamlessly integrated Dima into Spark
                 and developed effective query optimization techniques
                 in Spark. To the best of our knowledge, this is the
                 first full-fledged distributed in-memory system that
                 can support complex similarity-based query processing
                 on large-scale datasets. We have conducted extensive
                 experiments on four real-world datasets. Experimental
                 results show that Dima outperforms state-of-the-art
                 studies by 1--3 orders of magnitude and has good
                 scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ruan:2019:FGS,
  author =       "Pingcheng Ruan and Gang Chen and Tien Tuan Anh Dinh and
                 Qian Lin and Beng Chin Ooi and Meihui Zhang",
  title =        "Fine-grained, secure and efficient data provenance on
                 blockchain systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "975--988",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329775",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The success of Bitcoin and other cryptocurrencies bring
                 enormous interest to blockchains. A blockchain system
                 implements a tamper-evident ledger for recording
                 transactions that modify some global states. The system
                 captures entire evolution history of the states. The
                 management of that history, also known as data
                 provenance or lineage, has been studied extensively in
                 database systems. However, querying data history in
                 existing blockchains can only be done by replaying all
                 transactions. This approach is applicable to
                 large-scale, offline analysis, but is not suitable for
                 online transaction processing. We present LineageChain,
                 a fine-grained, secure and efficient provenance system
                 for blockchains. LineageChain exposes provenance
                 information to smart contracts via simple and elegant
                 interfaces, thereby enabling a new class of blockchain
                 applications whose execution logics depend on
                 provenance information at runtime. LineageChain
                 captures provenance during contract execution, and
                 efficiently stores it in a Merkle tree. LineageChain
                 provides a novel skip list index designed for
                 supporting efficient provenance query processing. We
                 have implemented LineageChain on top of Hyperledger and
                 a blockchain-optimized storage system called ForkBase.
Our extensive evaluation of LineageChain demonstrates its benefits to
                 the new class of blockchain applications, its efficient
                 query, and its small storage overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Choi:2019:PTK,
  author =       "Dalsu Choi and Chang-Sup Park and Yon Dohn Chung",
  title =        "Progressive top-$k$ subarray query processing in array
                 databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "989--1001",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329776",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Unprecedented amounts of multidimensional array data
                 are currently being generated in many fields. These
                 multidimensional array data naturally and efficiently
                 fit into the array data model, and many array
                 management systems based on the array data model have
                 appeared. Accordingly, the requirement for data
                 exploration methods for large multidimensional array
                 data has also increased. In this paper, we propose a
                 method for efficient top-$k$ subarray query processing
                 in array databases, which is one of the most important
                 query types for exploring multidimensional data. First,
                 we define novel top-$k$ query models for array
                 databases: overlap-allowing and disjoint top-$k$
                 subarray queries. Second, we propose a suite of
                 top-$k$ subarray query processing methods, called PPTS
                 and extend them to distributed processing. Finally, we
                 present the results of extensive experiments using real
                 datasets from an array database, which show that our
                 proposed methods outperform existing na{\"\i}ve
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Hoffmann:2019:MLC,
  author =       "Moritz Hoffmann and Andrea Lattuada and Frank
                 McSherry",
  title =        "{Megaphone}: latency-conscious state migration for
                 distributed streaming dataflows",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "1002--1015",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329777",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We design and implement Megaphone, a data migration
                 mechanism for stateful distributed dataflow engines
                 with latency objectives. When compared to existing
                 migration mechanisms, Megaphone has the following
                 differentiating characteristics: (i) migrations can be
                 subdivided to a configurable granularity to avoid
                 latency spikes, and (ii) migrations can be prepared
                 ahead of time to avoid runtime coordination. Megaphone
                 is implemented as a library on an unmodified timely
                 dataflow implementation, and provides an operator
                 interface compatible with its existing APIs. We
                 evaluate Megaphone on established benchmarks with
                 varying amounts of state and observe that compared to
                 na{\"\i}ve approaches Megaphone reduces service
                 latencies during reconfiguration by orders of magnitude
                 without significantly increasing steady-state
                 overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Tam:2019:ADR,
  author =       "Nguyen, Thanh Tam and Matthias Weidlich and Bolong
                 Zheng and Hongzhi Yin and Nguyen, Quoc Viet Hung and
                 Bela Stantic",
  title =        "From anomaly detection to rumour detection using data
                 streams of social platforms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "1016--1029",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329778",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Social platforms became a major source of rumours.
                 While rumours can have severe real-world implications,
                 their detection is notoriously hard: Content on social
                 platforms is short and lacks semantics; it spreads
                 quickly through a dynamically evolving network; and
                 without considering the context of content, it may be
                 impossible to arrive at a truthful interpretation.
                 Traditional approaches to rumour detection, however,
                 exploit solely a single content modality, e.g., social
                 media posts, which limits their detection accuracy. In
                 this paper, we cope with the aforementioned challenges
                 by means of a multi-modal approach to rumour detection
                 that identifies anomalies in both, the entities (e.g.,
                 users, posts, and hashtags) of a social platform and
                 their relations. Based on local anomalies, we show how
                 to detect rumours at the network level, following a
                 graph-based scan approach. In addition, we propose
                 incremental methods, which enable us to detect rumours
                 using streaming data of social platforms. We illustrate
                 the effectiveness and efficiency of our approach with a
                 real-world dataset of 4M tweets with more than 1000
                 rumours.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gupta:2019:OIT,
  author =       "Peeyush Gupta and Yin Li and Sharad Mehrotra and Nisha
                 Panwar and Shantanu Sharma and Sumaya Almanee",
  title =        "{Obscure}: information-theoretic oblivious and
                 verifiable aggregation queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "1030--1043",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329779",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Despite extensive research on cryptography, secure and
                 efficient query processing over outsourced data remains
                 an open challenge. We develop communication-efficient
                 and information-theoretically secure algorithms for
                 privacy-preserving aggregation queries using
                 multi-party computation (MPC). Specifically, query
                 processing techniques over secret-shared data
                 outsourced by single or multiple database owners are
                 developed. These algorithms allow a user to execute
                 queries on the secret-shared database and also prevent
                 the network and the (adversarial) clouds to learn the
                 user's queries, results, or the database. We further
                 develop (non-mandatory) privacy-preserving result
                 verification algorithms that detect malicious
                 behaviors, and experimentally validate the efficiency
                 of our approach over large datasets, the size of which
                 prior approaches to secret-sharing or MPC systems have
                 not scaled to.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Dutt:2019:SER,
  author =       "Anshuman Dutt and Chi Wang and Azade Nazi and Srikanth
                 Kandula and Vivek Narasayya and Surajit Chaudhuri",
  title =        "Selectivity estimation for range predicates using
                 lightweight models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "9",
  pages =        "1044--1057",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3329772.3329780",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Query optimizers depend on selectivity estimates of
                 query predicates to produce a good execution plan. When
                 a query contains multiple predicates, today's
                 optimizers use a variety of assumptions, such as
                 independence between predicates, to estimate
                 selectivity. While such techniques have the benefit of
                 fast estimation and small memory footprint, they often
                 incur large selectivity estimation errors. In this
                 work, we reconsider selectivity estimation as a
                 regression problem. We explore application of neural
                 networks and tree-based ensembles to the important
                 problem of selectivity estimation of multi-dimensional
                 range predicates. While their straightforward
                 application does not outperform even simple baselines,
                 we propose two simple yet effective design choices,
                 i.e., regression label transformation and feature
                 engineering, motivated by the selectivity estimation
                 context. Through extensive empirical evaluation across
                 a variety of datasets, we show that the proposed models
                 deliver both highly accurate estimates as well as fast
                 estimation.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Yuan:2019:CSP,
  author =       "Ye Yuan and Xiang Lian and Guoren Wang and Yuliang Ma
                 and Yishu Wang",
  title =        "Constrained shortest path query in a large
                 time-dependent graph",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "10",
  pages =        "1058--1070",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3339490.3339491",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The constrained shortest path (CSP) query over static
                 graphs has been extensively studied, since it has wide
                 applications in transportation networks,
                 telecommunication networks and etc. Such networks are
                 dynamic and evolve over time, being modeled as
                 time-dependent graphs. Therefore, in this paper, we
                 study the CSP query over a large time-dependent graph.
                 Specifically, we study the point CSP (PCSP) query and
                 interval CSP (ICSP) query. We formally prove that it is
                 NP-complete to process a PCSP query and at least
                 EXPSPACE to answer an ICSP query. We propose
                 approximate sequential algorithms to answer the PCSP
                 and ICSP queries efficiently. We also develop parallel
                 algorithms for the queries that guarantee to scale with
                 big time-dependent graphs. Using real-life graphs, we
                 experimentally verify the efficiency and scalability of
                 our algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Chu:2019:FTC,
  author =       "Lingyang Chu and Zhefeng Wang and Jian Pei and Yanyan
                 Zhang and Yu Yang and Enhong Chen",
  title =        "Finding theme communities from database networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "10",
  pages =        "1071--1084",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3339490.3339492",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Given a database network where each vertex is
                 associated with a transaction database, we are
                 interested in finding theme communities. Here, a theme
                 community is a cohesive subgraph such that a common
                 pattern is frequent in all transaction databases
                 associated with the vertices in the subgraph. Finding
                 all theme communities from a database network enjoys
                 many novel applications. However, it is challenging
                 since even counting the number of all theme communities
                 in a database network is \#P-hard. Inspired by the
                 observation that a theme community shrinks when the
                 length of the pattern increases, we investigate several
                 properties of theme communities and develop TCFI, a
                 scalable algorithm that uses these properties to
                 effectively prune the patterns that cannot form any
                 theme community. We also design TC-Tree, a scalable
                 algorithm that decomposes and indexes theme communities
                 efficiently. Retrieving a ranked list of theme
                 communities from a TC-Tree of hundreds of millions of
                 theme communities takes less than 1 second. Extensive
                 experiments and a case study demonstrate the
                 effectiveness and scalability of TCFI and TC-Tree in
                 discovering and querying meaningful theme communities
                 from large database networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pan:2019:RSB,
  author =       "James J. Pan and Guoliang Li and Juntao Hu",
  title =        "{Ridesharing}: simulator, benchmark, and evaluation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "10",
  pages =        "1085--1098",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3339490.3339493",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Ridesharing is becoming a popular mode of
                 transportation with profound effects on the industry.
                 Recent algorithms for vehicle-to-customer matching have
                 been developed; yet cross-study evaluations of their
                 performance and applicability to real-world ridesharing
                 are lacking. Evaluation is complicated by the online
                 and real-time nature of the ridesharing problem. In
                 this paper, we develop a simulator for evaluating
                 ridesharing algorithms, and we provide a set of
                 benchmarks to test a wide range of scenarios
                 encountered in the real world. These scenarios include
                 different road networks, different numbers of vehicles,
                 larger scales of customer requests, and others. We
                 apply the benchmarks to several state-of-the-art search
                 and join based ridesharing algorithms to demonstrate
                 the usefulness of the simulator and the benchmarks. We
                 find quickly-computable heuristics outperforming other
                 more complex methods, primarily due to faster
                 computation speed. Our work points the direction for
                 designing and evaluating future ridesharing
                 algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Lai:2019:DSM,
  author =       "Longbin Lai and Zhu Qing and Zhengyi Yang and Xin Jin
                 and Zhengmin Lai and Ran Wang and Kongzhang Hao and
                 Xuemin Lin and Lu Qin and Wenjie Zhang and Ying Zhang
                 and Zhengping Qian and Jingren Zhou",
  title =        "Distributed subgraph matching on timely dataflow",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "12",
  number =       "10",
  pages =        "1099--1112",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3339490.3339494",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Oct 2 06:49:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Recently there emerge many distributed algorithms that
                 aim at solving subgraph matching at scale. Existing
                 algorithm-level comparisons failed to provide a
                 systematic view of distributed subgraph matching mainly
                 due to the intertwining of strategy and optimization.
                 In this paper, we identify four strategies and three
                 general-purpose optimizations from representative
                 state-of-the-art algorithms. We implement the four
                 strategies with the optimizations based on the common
                 Timely dataflow system for systematic strategy-level
                 comparison. Our implementation covers all
                 representative algorithms. We conduct extensive
                 experiments for both unlabelled matching and labelled
                 matching to analyze the performance of distributed
                 subgraph matching under various settings, which is
                 finally summarized as a practical guide.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Qiao:2019:HDS, author = "Shi Qiao and Adrian Nicoara and Jin Sun and Marc Friedman and Hiren Patel and Jaliya Ekanayake", title = "Hyper dimension shuffle: efficient data repartition at petabyte scale in {SCOPE}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1113--1125", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339495", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In distributed query processing, data shuffle is one of the most costly operations. We examined scaling limitations to data shuffle that current systems and the research literature do not solve. As the number of input and output partitions increases, na{\"\i}ve shuffling will result in high fan-out and fan-in. There are practical limits to fan-out, as a consequence of limits on memory buffers, network ports and I/O handles. There are practical limits to fan-in because it multiplies the communication errors due to faults in commodity clusters impeding progress. Existing solutions that limit fan-out and fan-in do so at the cost of scaling quadratically in the number of nodes in the data flow graph. This dominates the costs of shuffling large datasets. We propose a novel algorithm called Hyper Dimension Shuffle that we have introduced in production in SCOPE, Microsoft's internal big data analytics system. Hyper Dimension Shuffle is inspired by the divide and conquer concept, and utilizes a recursive partitioner with intermediate aggregations. It yields quasilinear complexity of the shuffling graph with tight guarantees on fan-out and fan-in. 
We demonstrate how it avoids the shuffling graph blow-up of previous algorithms to shuffle at petabyte-scale efficiently on both synthetic benchmarks and real applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cormode:2019:ARQ, author = "Graham Cormode and Tejas Kulkarni and Divesh Srivastava", title = "Answering range queries under local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1126--1138", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339496", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Counting the fraction of a population having an input within a specified interval i.e. a range query, is a fundamental data analysis primitive. Range queries can also be used to compute other core statistics such as quantiles, and to build prediction models. However, frequently the data is subject to privacy concerns when it is drawn from individuals, and relates for example to their financial, health, religious or political status. In this paper, we introduce and analyze methods to support range queries under the local variant of differential privacy [23], an emerging standard for privacy-preserving data analysis. The local model requires that each user releases a noisy view of her private data under a privacy guarantee. While many works address the problem of range queries in the trusted aggregator setting, this problem has not been addressed specifically under untrusted aggregation (local DP) model even though many primitives have been developed recently for estimating a discrete distribution. We describe and analyze two classes of approaches for range queries, based on hierarchical histograms and the Haar wavelet transform. 
We show that both have strong theoretical accuracy guarantees on variance. In practice, both methods are fast and require minimal computation and communication resources. Our experiments show that the wavelet approach is most accurate in high privacy settings, while the hierarchical approach dominates for weaker privacy requirements.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:VPB, author = "Kai Wang and Xuemin Lin and Lu Qin and Wenjie Zhang and Ying Zhang", title = "Vertex priority based butterfly counting for large-scale bipartite networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1139--1152", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339497", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Bipartite networks are of great importance in many real-world applications. In bipartite networks, butterfly (i.e., a complete 2 x 2 biclique) is the smallest non-trivial cohesive structure and plays a key role. In this paper, we study the problem of efficient counting the number of butterflies in bipartite networks. The most advanced techniques are based on enumerating wedges which is the dominant cost of counting butterflies. Nevertheless, the existing algorithms cannot efficiently handle large-scale bipartite networks. This becomes a bottleneck in large-scale applications. In this paper, instead of the existing layer-priority-based techniques, we propose a vertex-priority-based paradigm BFC-VP to enumerate much fewer wedges; this leads to a significant improvement of the time complexity of the state-of-the-art algorithms. 
In addition, we present cache-aware strategies to further improve the time efficiency while theoretically retaining the time complexity of BFC-VP. Moreover, we also show that our proposed techniques can work efficiently in external and parallel contexts. Our extensive empirical studies demonstrate that the proposed techniques can speed up the state-of-the-art techniques by up to two orders of magnitude for the real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2019:BVS, author = "Yang Cao and Wenfei Fan and Tengfei Yuan", title = "Block as a value for {SQL} over {NoSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1153--1166", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339498", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents Zidian, a middleware for key--value (KV) stores to speed up SQL query evaluation over NoSQL. As opposed to common practice that takes a tuple id or primary key as key and the entire tuple as value, Zidian proposes a block-as-a-value model BaaV. BaaV represents a relation as keyed blocks $(k, B)$, where $k$ is a key of a block (a set) $B$ of partial tuples. We extend relational algebra to BaaV. We show that under BaaV, Zidian substantially reduces data access and communication cost. We provide characterizations (sufficient and necessary conditions) for (a) result-preserving queries, i.e., queries covered by available BaaV stores, (b) scan-free queries, i.e., queries that can be evaluated without scanning any table, and (c) bounded queries, i.e., queries that can be answered by accessing a bounded amount of data.
We show that in parallel processing, Zidian guarantees (a) no scans for scan-free queries, (b) bounded communication cost for bounded queries; and (c) parallel scalability, i.e., speed up when adding processors. Moreover, Zidian can be plugged into existing SQL-over-NoSQL systems and retains horizontal scalability. Using benchmark and real-life data, we empirically verify that Zidian improves existing SQL-over-NoSQL systems by 2 orders of magnitude on average.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tangwongsan:2019:OGO, author = "Kanat Tangwongsan and Martin Hirzel and Scott Schneider", title = "Optimal and general out-of-order sliding-window aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1167--1180", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339499", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sliding-window aggregation derives a user-defined summary of the most-recent portion of a data stream. For in-order streams, each window change can be handled in $O(1)$ time even when the aggregation operator is not invertible. But streaming data often arrive inherently out-of-order, e.g., due to clock drifts and communication delays. For such streams, prior work resorted to latency-prone buffering or spent $O(\log n)$ time for every window change, where $n$ is the instantaneous window size. This paper presents FiBA, a novel real-time sliding window aggregation algorithm that optimally handles streams of varying degrees of out-of-orderness. FiBA is as general as the state-of-the-art and supports variable-sized windows. An insert or evict takes amortized $O(\log d)$ time, where $d$ is the distance of the change to the window's boundary.
This means $O(1)$ time for in-order arrivals and nearly $O(1)$ time for slightly out-of-order arrivals, tending to $O(\log n)$ time for the most severely out-of-order arrivals. We also prove a matching lower bound, showing optimality. At its heart, the algorithm combines and extends finger searching, lazy rebalancing, and position-aware partial aggregates. Further, FiBA can answer range queries that aggregate subwindows for window sharing. Finally, our experiments show that FiBA performs well in practice and conforms to the theoretical findings, with significantly higher throughput than $O(\log n)$ algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tang:2019:CTR, author = "Bo Tang and Kyriakos Mouratidis and Man Lung Yiu and Zhenyu Chen", title = "Creating top ranking options in the continuous option and preference space", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1181--1194", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339500", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Top-$k$ queries are extensively used to retrieve the $k$ most relevant options (e.g., products, services, accommodation alternatives, etc.) based on a weighted scoring function that captures user preferences. In this paper, we take the viewpoint of a business owner who plans to introduce a new option to the market, with a certain type of clientele in mind. Given a target region in the consumer spectrum, we determine what attribute values the new option should have, so that it ranks among the top-$k$ for any user in that region.
Our methodology can also be used to improve an existing option, at the minimum modification cost, so that it ranks consistently high for an intended type of customers. This is the first work on competitive option placement where no distinct user(s) are targeted, but a general clientele type, i.e., a continuum of possible preferences. Here also lies our main challenge (and contribution), i.e., dealing with the interplay between two continuous spaces: the targeted region in the preference spectrum, and the option domain (where the new option will be placed). At the core of our methodology lies a novel and powerful interlinking between the two spaces. Our algorithms offer exact answers in practical response times, even for the largest of the standard benchmark datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ma:2019:OBE, author = "Hanchao Ma and Morteza Alipourlangouri and Yinghui Wu and Fei Chiang and Jiaxing Pi", title = "Ontology-based entity matching in attributed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1195--1207", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339501", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Keys for graphs incorporate the topology and value constraints needed to uniquely identify entities in a graph. They have been studied to support object identification, knowledge fusion, and social network reconciliation. Existing key constraints identify entities as the matches of a graph pattern by subgraph isomorphism, which enforce label equality on node types. These constraints can be too restrictive to characterize structures and node labels that are syntactically different but semantically equivalent. 
We propose a new class of key constraints, Ontological Graph Keys (OGKs) that extend conventional graph keys by ontological subgraph matching between entity labels and an external ontology. We show that the implication and validation problems for OGKs are each NP-complete. To reduce the entity matching cost, we also provide an algorithm to compute a minimal cover for OGKs. We then study the entity matching problem with OGKs, and a practical variant with a budget on the matching cost. We develop efficient algorithms to perform entity matching based on a (budgeted) Chase procedure. Using real-world graphs, we experimentally verify the efficiency and accuracy of OGK-based entity matching.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2019:RTD, author = "Lu Chen and Yunjun Gao and Ziquan Fang and Xiaoye Miao and Christian S. Jensen and Chenjuan Guo", title = "Real-time distributed co-movement pattern detection on streaming trajectories", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1208--1220", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339502", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the widespread deployment of mobile devices with positioning capabilities, increasingly massive volumes of trajectory data are being collected that capture the movements of people and vehicles. This data enables co-movement pattern detection, which is important in applications such as trajectory compression and future-movement prediction. Existing co-movement pattern detection studies generally consider historical data and thus propose offline algorithms. 
However, applications such as future movement prediction need real-time processing over streaming trajectories. Thus, we investigate real-time distributed co-movement pattern detection over streaming trajectories. Existing off-line methods assume that all data is available when the processing starts. Nevertheless, in a streaming setting, unbounded data arrives in real time, making pattern detection challenging. To this end, we propose a framework based on Apache Flink, which is designed for efficient distributed streaming data processing. The framework encompasses two phases: clustering and pattern enumeration. To accelerate the clustering, we use a range join based on two-layer indexing, and provide techniques that eliminate unnecessary verifications. To perform pattern enumeration efficiently, we present two methods FBA and VBA that utilize id-based partitioning. When coupled with bit compression and candidate-based enumeration techniques, we reduce the enumeration cost from exponential to linear. Extensive experiments offer insight into the efficiency of the proposed framework and its constituent techniques compared with existing methods.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tan:2019:IIB, author = "Jian Tan and Tieying Zhang and Feifei Li and Jie Chen and Qixing Zheng and Ping Zhang and Honglin Qiao and Yue Shi and Wei Cao and Rui Zhang", title = "{iBTune}: individualized buffer tuning for large-scale cloud databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "10", pages = "1221--1234", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3339490.3339503", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tuning the buffer size appropriately is critical to the performance of a cloud database, since memory is usually the resource bottleneck. For large-scale databases supporting heterogeneous applications, configuring the individual buffer sizes for a significant number of database instances presents a scalability challenge. Manual optimization is neither efficient nor effective, and even not feasible for large cloud clusters, especially when the workload may dynamically change on each instance. The difficulty lies in the fact that each database instance requires a different buffer size that is highly individualized, subject to the constraint of the total buffer memory space. It is imperative to resort to algorithms that automatically orchestrate the buffer pool tuning for the entire database instances. To this end, we design iBTune that has been deployed for more than 10,000 OLTP cloud database instances in our production system. Specifically, it leverages the information from similar workloads to find out the tolerable miss ratio of each instance. Then, it utilizes the relationship between miss ratios and allocated memory sizes to individually optimize the target buffer pool sizes.
To provide a guaranteed level of service level agreement (SLA), we design a pairwise deep neural network that uses features from measurements on pairs of instances to predict the upper bounds of the request response times. A target buffer pool size can be adjusted only when the predicted response time upper bound is in a safe limit. The successful deployment on a production environment, which safely reduces the memory footprint by more than 17\% compared to the original system that relies on manual configurations, demonstrates the effectiveness of our solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Whittaker:2019:OTI, author = "Michael Whittaker and Nick Edmonds and Sandeep Tata and James B. Wendt and Marc Najork", title = "Online template induction for machine-generated emails", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1235--1248", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342264", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In emails, information abounds. Whether it be a bill reminder, a hotel confirmation, or a shipping notification, our emails contain useful bits of information that enable a number of applications. Most of this email traffic is machine-generated, sent from a business to a human. These business-to-consumer emails are typically instantiated from a set of email templates, and discovering these templates is a key step in enabling a variety of intelligent experiences. 
Existing email information extraction systems typically separate information extraction into two steps: an offline template discovery process (called template induction) that is periodically run on a sample of emails, and an online email annotation process that applies discovered templates to emails as they arrive. Since information extraction requires an email's template to be known, any delay in discovering a newly created template causes missed extractions, lowering the overall extraction coverage. In this paper, we present a novel system called Crusher that discovers templates completely online, reducing template discovery delay from a week (for the existing MapReduce-based batch system) to minutes. Furthermore, Crusher has a resource consumption footprint that is significantly smaller than the existing batch system. We also report on the surprising lesson we learned that conventional stream processing systems do not present a good framework on which to build Crusher. Crusher delivers an order of magnitude more throughput than a prototype built using a stream processing engine. We hope that these lessons help designers of stream processing systems accommodate a broader range of applications like online template induction in the future.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:QSP, author = "Yong Wang and Guoliang Li and Nan Tang", title = "Querying shortest paths on time dependent road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1249--1261", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For real-world time dependent road networks (TDRNs), answering shortest path-based route queries and plans in real-time is highly desirable by many industrial applications. Unfortunately, traditional (Dijkstra- or A*-like) algorithms are computationally expensive for such tasks on TDRNs. Naturally, indexes are needed to meet the real-time constraint required by real applications. In this paper, we propose a novel height-balanced tree-structured index, called TD-G-tree, which supports fast route queries over TDRNs. The key idea is to use hierarchical graph partitioning to split a road network into hierarchical partitions. This will produce a balanced tree, where each tree node corresponds to a partition and each parent-child relationship corresponds to a partition and its sub-partition. We then compute and index time dependent shortest paths (TDSPs) only for borders (i.e., vertices whose edges are cut by a partition). Based on TD-G-tree, we devise efficient algorithms to support TDSP queries, as well as time-interval based route planning, for computing optimal solutions through dynamic programming and chronological divide-and-conquer. Extensive experiments on real-world datasets show that our method significantly outperforms existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fariha:2019:EDQ, author = "Anna Fariha and Alexandra Meliou", title = "Example-driven query intent discovery: abductive reasoning using semantic similarity", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1262--1275", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional relational data interfaces require precise structured queries over potentially complex schemas. These rigid data retrieval mechanisms pose hurdles for non-expert users, who typically lack language expertise and are unfamiliar with the details of the schema. Query by Example (QBE) methods offer an alternative mechanism: users provide examples of their intended query output and the QBE system needs to infer the intended query. However, these approaches focus on the structural similarity of the examples and ignore the richer context present in the data. As a result, they typically produce queries that are too general, and fail to capture the user's intent effectively. In this paper, we present SQuID, a system that performs semantic similarity-aware query intent discovery. Our work makes the following contributions: (1) We design an end-to-end system that automatically formulates select-project-join queries in an open-world setting, with optional group-by aggregation and intersection operators; a much larger class than prior QBE techniques. (2) We express the problem of query intent discovery using a probabilistic abduction model, that infers a query as the most likely explanation of the provided examples. 
(3) We introduce the notion of an abduction-ready database, which precomputes semantic properties and related statistics, allowing SQuID to achieve real-time performance. (4) We present an extensive empirical evaluation on three real-world datasets, including user-intent case studies, demonstrating that SQuID is efficient and effective, and outperforms machine learning methods, as well as the state-of-the-art in the related query reverse engineering problem.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2019:AVQ, author = "Qi Zhou and Joy Arulraj and Shamkant Navathe and William Harris and Dong Xu", title = "Automated verification of query equivalence using satisfiability modulo theories", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1276--1288", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database-as-a-service offerings enable users to quickly create and deploy complex data processing pipelines. In practice, these pipelines often exhibit significant overlap of computation due to redundant execution of certain sub-queries. It is challenging for developers and database administrators to manually detect overlap across queries since they may be distributed across teams, organization roles, and geographic locations. Thus, we require automated cloud-scale tools for identifying equivalent queries to minimize computation overlap. State-of-the-art algebraic approaches to automated verification of query equivalence suffer from two limitations. First, they are unable to model the semantics of widely-used SQL features, such as complex query predicates and three-valued logic. 
Second, they have a computationally intensive verification procedure. These limitations restrict their efficacy and efficiency in cloud-scale database-as-a-service offerings. This paper makes the case for an alternate approach to determining query equivalence based on symbolic representation. The key idea is to effectively transform a wide range of SQL queries into first order logic formulae and then use satisfiability modulo theories to efficiently verify their equivalence. We have implemented this symbolic representation-based approach in EQUITAS. Our evaluation shows that EQUITAS proves the semantic equivalence of a larger set of query pairs compared to algebraic approaches and reduces the verification time by 27X. We also demonstrate that on a set of 17,461 real-world SQL queries, it automatically identifies redundant execution across 11\% of the queries. Our symbolic-representation based technique is currently deployed on Alibaba's MaxCompute database-as-a-service platform.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2019:TUF, author = "Pengfei Xu and Jiaheng Lu", title = "Towards a unified framework for string similarity joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1289--1302", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342268", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A similarity join aims to find all similar pairs between two collections of records. Established algorithms utilise different similarity measures, either syntactic or semantic, to quantify the similarity between two records. 
However, when records are similar in forms of a mixture of syntactic and semantic relations, utilising a single measure becomes inadequate to disclose the real similarity between records, and hence unable to obtain high-quality join results. In this paper, we study a unified framework to find similar records by combining multiple similarity measures. To achieve this goal, we first develop a new similarity framework that unifies the existing three kinds of similarity measures simultaneously, including syntactic (typographic) similarity, synonym-based similarity, and taxonomy-based similarity. We then theoretically prove that finding the maximum unified similarity between two strings is generally NP-hard, and furthermore develop an approximate algorithm which runs in polynomial time with a non-trivial approximation guarantee. To support efficient string joins based on our unified similarity measure, we adopt the filter-and-verification framework and propose a new signature structure, called pebble, which can be simultaneously adapted to handle multiple similarity measures. The salient feature of our approach is that, it can judiciously select the best pebble signatures and the overlap thresholds to maximise the filtering power. Extensive experiments show that our methods are capable of finding similar records having mixed types of similarity relations, while exhibiting high efficiency and scalability for similarity joins. The implementation can be downloaded at https://github.com/HY-UDBMS/AU-Join.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yoon:2019:NEF, author = "Susik Yoon and Jae-Gil Lee and Byung Suk Lee", title = "{NETS}: extremely fast outlier detection from a data stream via set-based processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1303--1315", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342269", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper addresses the problem of efficiently detecting outliers from a data stream as old data points expire from and new data points enter the window incrementally. The proposed method is based on a newly discovered characteristic of a data stream that the change in the locations of data points in the data space is typically very insignificant. This observation has led to the finding that the existing distance-based outlier detection algorithms perform excessive unnecessary computations that are repetitive and/or canceling out the effects. Thus, in this paper, we propose a novel set-based approach to detecting outliers, whereby data points at similar locations are grouped and the detection of outliers or inliers is handled at the group level. Specifically, a new algorithm NETS is proposed to achieve a remarkable performance improvement by realizing set-based early identification of outliers or inliers and taking advantage of the ``net effect'' between expired and new data points. Additionally, NETS is capable of achieving the same efficiency even for a high-dimensional data stream through two-level dimensional filtering. Comprehensive experiments using six real-world data streams show 5 to 25 times faster processing time than state-of-the-art algorithms with comparable memory consumption. 
We assert that NETS opens a new possibility to real-time data stream outlier detection.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2019:SST, author = "Yi Lu and Xiangyao Yu and Samuel Madden", title = "{STAR}: scaling transactions through asymmetric replication", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1316--1329", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342270", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we present STAR, a new distributed in-memory database with asymmetric replication. By employing a single-node non-partitioned architecture for some replicas and a partitioned architecture for other replicas, STAR is able to efficiently run both highly partitionable workloads and workloads that involve cross-partition transactions. The key idea is a new phase-switching algorithm where the execution of single-partition and cross-partition transactions is separated. In the partitioned phase, single-partition transactions are run on multiple machines in parallel to exploit more concurrency. In the single-master phase, mastership for the entire database is switched to a single designated master node, which can execute these transactions without the use of expensive coordination protocols like two-phase commit. Because the master node has a full copy of the database, this phase-switching can be done at negligible cost. 
Our experiments on two popular benchmarks (YCSB and TPC-C) show that high availability via replication can coexist with fast serializable transaction execution in distributed in-memory databases, with STAR outperforming systems that employ conventional concurrency control and replication algorithms by up to one order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2019:SD, author = "Yuliang Li and Aaron Feng and Jinfeng Li and Saran Mumick and Alon Halevy and Vivian Li and Wang-Chiew Tan", title = "Subjective databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1330--1343", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Online users are constantly seeking experiences, such as a hotel with clean rooms and a lively bar, or a restaurant for a romantic rendezvous. However, e-commerce search engines only support queries involving objective attributes such as location, price, and cuisine, and any experiential data is relegated to text reviews. In order to support experiential queries, a database system needs to model subjective data. Users should be able to pose queries that specify subjective experiences using their own words, in addition to conditions on the usual objective attributes. This paper introduces OpineDB, a subjective database system that addresses these challenges. We introduce a data model for subjective databases. We describe how OpineDB translates subjective queries against the subjective database schema, which is done by matching the user query phrases to the underlying schema. 
We also show how the experiential conditions specified by the user can be combined and the results aggregated and ranked. We demonstrate that subjective databases satisfy user needs more effectively and accurately than alternative techniques through experiments with real data of hotel and restaurant reviews.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2019:FRD, author = "Xuguang Ren and Junhu Wang and Wook-Shin Han and Jeffrey Xu Yu", title = "Fast and robust distributed subgraph enumeration", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1344--1356", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342272", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the subgraph enumeration problem under distributed settings. Existing solutions either suffer from severe memory crisis or rely on large indexes, which makes them impractical for very large graphs. Most of them follow a synchronous model where the performance is often bottlenecked by the machine with the worst performance. Motivated by this, in this paper, we propose RADS, a Robust Asynchronous Distributed Subgraph enumeration system. RADS first identifies results that can be found using single-machine algorithms. This strategy not only improves the overall performance but also reduces network communication and memory cost. Moreover, RADS employs a novel region-grouped multi-round expand verify \& filter framework which does not need to shuffle and exchange the intermediate results, nor does it need to replicate a large part of the data graph in each machine. 
This feature not only reduces network communication cost and memory usage, but also allows us to adopt simple strategies for memory control and load balancing, making it more robust. Several optimization strategies are also used in RADS to further improve the performance. Our experiments verified the superiority of RADS to state-of-the-art subgraph enumeration approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fu:2019:EEL, author = "Fangcheng Fu and Jiawei Jiang and Yingxia Shao and Bin Cui", title = "An experimental evaluation of large scale {GBDT} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1357--1370", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342273", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Gradient boosting decision tree (GBDT) is a widely-used machine learning algorithm in both data analytic competitions and real-world industrial applications. Further, driven by the rapid increase in data volume, efforts have been made to train GBDT in a distributed setting to support large-scale workloads. However, we find it surprising that the existing systems manage the training dataset in different ways, but none of them have studied the impact of data management. To that end, this paper aims to study the pros and cons of different data management methods regarding the performance of distributed GBDT. We first introduce a quadrant categorization of data management policies based on data partitioning and data storage. Then we conduct an in-depth systematic analysis and summarize the advantageous scenarios of the quadrants.
Based on the analysis, we further propose a novel distributed GBDT system named Vero, which adopts the unexplored composition of vertical partitioning and row-store and suits for many large-scale cases. To validate our analysis empirically, we implement different quadrants in the same code base and compare them under extensive workloads, and finally compare Vero with other state-of-the-art systems over a wide range of datasets. Our theoretical and experimental results provide a guideline on choosing a proper data management policy for a given workload.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kotsogiannis:2019:PDP, author = "Ios Kotsogiannis and Yuchao Tao and Xi He and Maryam Fanaeepour and Ashwin Machanavajjhala and Michael Hay and Gerome Miklau", title = "{PrivateSQL}: a differentially private {SQL} query engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1371--1384", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342274", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy is considered a de facto standard for private data analysis. However, the definition and much of the supporting literature applies to flat tables. While there exist variants of the definition and specialized algorithms for specific types of relational data (e.g. graphs), there isn't a general privacy definition for multi-relational schemas with constraints, and no system that permits accurate differentially private answering of SQL queries while imposing a fixed privacy budget across all queries posed by the analyst. This work presents PrivateSQL, a first-of-its-kind end-to-end differentially private relational database system. 
PrivateSQL allows an analyst to query data stored in a standard database management system using a rich class of SQL counting queries. PrivateSQL adopts a novel generalization of differential privacy to multi-relational data that takes into account constraints in the schema like foreign keys, and allows the data owner to flexibly specify entities in the schema that need privacy. PrivateSQL ensures a fixed privacy loss across all the queries posed by the analyst by answering queries on private synopses generated from several views over the base relation that are tuned to have low error on a representative query workload. We experimentally evaluate PrivateSQL on a real-world dataset and a workload of more than 3,600 queries. We show that for 50\% of the queries PrivateSQL offers at least 1,000x better error rates than solutions adapted from prior work.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amiri:2019:CCA, author = "Mohammad Javad Amiri and Divyakant Agrawal and Amr {El Abbadi}", title = "{CAPER}: a cross-application permissioned blockchain", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1385--1398", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342275", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite recent intensive research, existing blockchain systems do not adequately address all the characteristics of distributed applications. In particular, distributed applications collaborate with each other following service level agreements (SLAs) to provide different services.
While collaboration between applications, e.g., cross-application transactions, should be visible to all applications, the internal data of each application, e.g., internal transactions, might be confidential. In this paper, we introduce CAPER, a permissioned blockchain system to support both internal and cross-application transactions of collaborating distributed applications. In CAPER, the blockchain ledger is formed as a directed acyclic graph where each application accesses and maintains only its own view of the ledger including its internal and all cross-application transactions. CAPER also introduces three consensus protocols to globally order cross-application transactions between applications with different internal consensus protocols. The experimental results reveal the efficiency of CAPER in terms of performance and scalability.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Koliousis:2019:CSD, author = "Alexandros Koliousis and Pijika Watcharapichat and Matthias Weidlich and Luo Mai and Paolo Costa and Peter Pietzuch", title = "{Crossbow}: scaling deep learning with small batch sizes on multi-{GPU} servers", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1399--1412", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342276", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep learning models are trained on servers with many GPUs, and training must scale with the number of GPUs. Systems such as TensorFlow and Caffe2 train models with parallel synchronous stochastic gradient descent: they process a batch of training data at a time, partitioned across GPUs, and average the resulting partial gradients to obtain an updated global model.
To fully utilise all GPUs, systems must increase the batch size, which hinders statistical efficiency. Users tune hyper-parameters such as the learning rate to compensate for this, which is complex and model-specific. We describe Crossbow, a new single-server multi-GPU system for training deep learning models that enables users to freely choose their preferred batch size---however small---while scaling to multiple GPUs. Crossbow uses many parallel model replicas and avoids reduced statistical efficiency through a new synchronous training method. We introduce SMA, a synchronous variant of model averaging in which replicas independently explore the solution space with gradient descent, but adjust their search synchronously based on the trajectory of a globally-consistent average model. Crossbow achieves high hardware efficiency with small batch sizes by potentially training multiple model replicas per GPU, automatically tuning the number of replicas to maximise throughput. Our experiments show that Crossbow improves the training time of deep learning models on an 8-GPU server by 1.3--4X compared to TensorFlow.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Feng:2019:FAA, author = "Kaiyu Feng and Gao Cong and Christian S. Jensen and Tao Guo", title = "Finding attribute-aware similar regions for data analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1414--1426", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342277", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the proliferation of mobile devices and location-based services, increasingly massive volumes of geo-tagged data are becoming available.
This data typically also contains non-location information. We study how to use such information to characterize a region and then how to find a region of the same size and with the most similar characteristics. This functionality enables a user to identify regions that share characteristics with a user-supplied region that the user is familiar with and likes. More specifically, we formalize and study a new problem called the attribute-aware similar region search (ASRS) problem. We first define so-called composite aggregators that are able to express aspects of interest in terms of the information associated with a user-supplied region. When applied to a region, an aggregator captures the region's relevant characteristics. Next, given a query region and a composite aggregator, we propose a novel algorithm called DS-Search to find the most similar region of the same size. Unlike any previous work on region search, DS-Search repeatedly discretizes and splits regions until a split region either satisfies a drop condition or it is guaranteed to not contribute to the result. In addition, we extend DS-Search to solve the ASRS problem approximately. Finally, we report on extensive empirical studies that offer insight into the efficiency and effectiveness of the paper's proposals.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tang:2019:IQP, author = "Dixin Tang and Zechao Shang and Aaron J. Elmore and Sanjay Krishnan and Michael J.
Franklin", title = "Intermittent query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1427--1441", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342278", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many applications ingest data in an intermittent, yet largely predictable, pattern. Existing systems tend to ignore how data arrives when making decisions about how to update (or refresh) an ongoing query. To address this shortcoming we propose a new query processing paradigm, Intermittent Query Processing (IQP), that bridges query execution and policies, to determine when to update results and how much resources to allocate for ensuring fast query updates. Here, for a query the system provides an initial result that is to be refreshed when policy dictates, such as after a defined number of new records arrive or a time interval elapses. In between intermittent data arrivals, IQP inactivates query execution by selectively releasing some resources occupied in normal execution that will be least helpful (for future refreshes) according to the arrival patterns for new records. We present an IQP prototype based on PostgreSQL that selectively persists the state associated with query operators to allow for fast query updates while constraining resource consumption. Our experiments show that for several application scenarios IQP greatly lowers query processing latency compared to batch systems, and largely reduces memory consumption with comparable latency compared to a state-of-the-art incremental view maintenance technique.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Budiu:2019:HTC, author = "Mihai Budiu and Parikshit Gopalan and Lalith Suresh and Udi Wieder and Han Kruiger and Marcos K. Aguilera", title = "{Hillview}: a trillion-cell spreadsheet for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1442--1457", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342279", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hillview is a distributed spreadsheet for browsing very large datasets that cannot be handled by a single machine. As a spreadsheet, Hillview provides a high degree of interactivity that permits data analysts to explore information quickly along many dimensions while switching visualizations on a whim. To provide the required responsiveness, Hillview introduces visualization sketches, or vizketches, as a simple idea to produce compact data visualizations. Vizketches combine algorithmic techniques for data summarization with computer graphics principles for efficient rendering. While simple, vizketches are effective at scaling the spreadsheet by parallelizing computation, reducing communication, providing progressive visualizations, and offering precise accuracy guarantees. Using Hillview running on eight servers, we can navigate and visualize datasets of tens of billions of rows and trillions of cells, much beyond the published capabilities of competing systems.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wei:2019:EFD, author = "Ziheng Wei and Sebastian Link", title = "Embedded functional dependencies and data-completeness tailored database design", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1458--1470", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342626", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We establish a robust schema design framework for data with missing values. The framework is based on the new notion of an embedded functional dependency, which is independent of the interpretation of missing values, able to express completeness and integrity requirements on application data, and capable of capturing many redundant data value occurrences. We establish axiomatic and algorithmic foundations for reasoning about embedded functional dependencies. These foundations allow us to establish generalizations of Boyce-Codd and Third normal forms that do not permit any redundancy in any future application data, or minimize their redundancy across dependency-preserving decompositions, respectively. We show how to transform any given schema into application schemata that meet given completeness and integrity requirements and the conditions of the generalized normal forms. Data over those application schemata are therefore fit for purpose by design. Extensive experiments with benchmark schemata and data illustrate our framework, and the effectiveness and efficiency of our algorithms, but also provide quantified insight into database schema design trade-offs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fan:2019:OVG, author = "Hua Fan and Wojciech Golab", title = "{Ocean Vista}: gossip-based visibility control for speedy geo-distributed transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1471--1484", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342627", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Providing ACID transactions under conflicts across globally distributed data is the Everest of transaction processing protocols. Transaction processing in this scenario is particularly costly due to the high latency of cross-continent network links, which inflates concurrency control and data replication overheads. To mitigate the problem, we introduce Ocean Vista --- a novel distributed protocol that guarantees strict serializability. We observe that concurrency control and replication address different aspects of resolving the visibility of transactions, and we address both concerns using a multi-version protocol that tracks visibility using version watermarks and arrives at correct visibility decisions using efficient gossip. Gossiping the watermarks enables asynchronous transaction processing and acknowledging transaction visibility in batches in the concurrency control and replication protocols, which improves efficiency under high cross-datacenter network delays. In particular, Ocean Vista can process conflicting transactions in parallel, and supports efficient write-quorum / read-one access using one round trip in the common case. 
We demonstrate experimentally in a multi-data-center cloud environment that our design outperforms a leading distributed transaction processing engine (TAPIR) more than 10-fold in terms of peak throughput, albeit at the cost of additional latency for gossip. The latency penalty is generally bounded by one wide area network (WAN) round trip time (RTT), and in the best case (i.e., under light load) our system nearly breaks even with TAPIR by committing transactions in around one WAN RTT.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:INF, author = "Xikui Wang and Michael J. Carey", title = "An {IDEA}: an ingestion framework for data enrichment in {AsterixDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1485--1498", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342628", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big Data today is being generated at an unprecedented rate from various sources such as sensors, applications, and devices, and it often needs to be enriched based on other reference information to support complex analytical queries. Depending on the use case, the enrichment operations can be compiled code, declarative queries, or machine learning models with different complexities. For enrichments that will be frequently used in the future, it can be advantageous to push their computation into the ingestion pipeline so that they can be stored (and queried) together with the data. In some cases, the referenced information may change over time, so the ingestion pipeline should be able to adapt to such changes to guarantee the currency and/or correctness of the enrichment results.
In this paper, we present a new data ingestion framework that supports data ingestion at scale, enrichments requiring complex operations, and adaptiveness to reference data changes. We explain how this framework has been built on top of Apache AsterixDB and investigate its performance at scale under various workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karyakin:2019:DMP, author = "Alexey Karyakin and Kenneth Salem", title = "{DimmStore}: memory power optimization for database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1499--1512", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342629", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Memory can consume a substantial amount of power in database servers, yet memory power has received considerably less attention than CPU power. Memory power consumption is also highly non-proportional. Thus, memory power becomes even more significant in the common case in which a database server is either not completely busy or not completely full. In this paper, we study the application of two memory power optimization techniques --- rank-aware allocation and rate-based layout --- to database systems. By concentrating memory load, rather than spreading it out evenly, these techniques create and exploit memory idleness to achieve power savings. We have implemented these techniques in a prototype database system called DimmStore. DimmStore is part of a memory power testbed which includes customized hardware with direct power measurement capabilities, allowing us to measure the techniques' effectiveness.
We use the testbed to empirically characterize the power saving opportunities provided by these techniques, as well as their performance impact, under YCSB and TPC-C workloads. Under simple YCSB workloads, power savings ranged up to 50\%, depending on load and space utilization, with little performance impact. Savings were smaller, but still significant, for TPC-C, which has more complex data locality characteristics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yan:2019:GAS, author = "Cong Yan and Alvin Cheung", title = "Generating application-specific data layouts for in-memory databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1513--1525", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342630", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database applications are often developed with object-oriented languages while using relational databases as the backend. To accelerate these applications, developers would manually design customized data structures to store data in memory, and ways to utilize such data structures to answer queries. Doing so is brittle and requires a lot of effort. Alternatively, developers might automate the process by using relational physical design tools to create materialized views and indexes instead. However, the characteristics of object-oriented database applications are often distinct enough from traditional database applications such that classical relational query optimization techniques often cannot speed up queries that arise from such applications, as our experiments show. To address this, we build Chestnut, a data layout generator for in-memory object-oriented database applications. 
Given a memory budget, Chestnut generates customized in-memory data layouts and query plans to answer queries written using a subset of the Rails API, a common framework for building object-oriented database applications. Chestnut differs from traditional query optimizers and physical designers in two ways. First, Chestnut automatically generates data layouts that are customized for the application after analyzing their queries, hence Chestnut-generated data layouts are designed to be efficient to answer queries from such applications. Second, Chestnut uses a novel enumeration and verification-based algorithm to generate query plans that use such data layouts, rather than rule-based approaches as in traditional query optimizers. We evaluated Chestnut on four open-source Rails database applications. The result shows that it can reduce average query processing time by over 3.6X (and up to 42X), as compared to other in-memory relational database engines.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hai:2019:RPT, author = "Rihan Hai and Christoph Quix", title = "Rewriting of plain {SO} tgds into nested tgds", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1526--1538", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342631", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Schema mappings express the relationships between sources in data interoperability scenarios and can be expressed in various formalisms. Source-to-target tuple-generating dependencies (s-t tgds) can be easily used for data transformation or query rewriting tasks. Second-order tgds (SO tgds) are more expressive as they can also represent the composition and inversion of s-t tgds.
Yet, the expressive power of SO tgds comes with the problem of undecidability for some reasoning tasks. Nested tgds and plain SO tgds are mapping languages that are between s-t tgds and SO tgds in terms of expressivity, and their properties have been studied in the recent years. Nested tgds are less expressive than plain SO tgds, but the logical equivalence problem for nested tgds is decidable. However, a detailed characterization of plain SO tgds that have an equivalent nested tgd is missing. In this paper, we present an algorithmic solution for translating plain SO tgds into nested tgds. The algorithm computes one or more nested tgds, if a given plain SO tgd is rewritable. Furthermore, we are able to give a detailed characterization of those plain SO tgds for which an equivalent nested tgd exists, based on the structural properties of the source predicates and Skolem functions in the plain SO tgd. In the evaluation, we show that our algorithm covers a larger subset of plain SO tgds than previous approaches and that a rewriting can be computed efficiently although the algorithm has the exponential complexity.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nathan:2019:BMD, author = "Senthil Nathan and Chander Govindarajan and Adarsh Saraf and Manish Sethi and Praveen Jayachandran", title = "Blockchain meets database: design and implementation of a blockchain relational database", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1539--1552", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342632", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we design and implement the first-ever decentralized replicated relational database with blockchain properties that we term blockchain relational database. We highlight several similarities between features provided by blockchain platforms and a replicated relational database, although they are conceptually different, primarily in their trust model. Motivated by this, we leverage the rich features, decades of research and optimization, and available tooling in relational databases to build a blockchain relational database. We consider a permissioned blockchain model of known, but mutually distrustful organizations each operating their own database instance that are replicas of one another. The replicas execute transactions independently and engage in decentralized consensus to determine the commit order for transactions. We design two approaches, the first where the commit order for transactions is agreed upon prior to executing them, and the second where transactions are executed without prior knowledge of the commit order while the ordering happens in parallel. 
We leverage serializable snapshot isolation (SSI) to guarantee that the replicas across nodes remain consistent and respect the ordering determined by consensus, and devise a new variant of SSI based on block height for the latter approach. We implement our system on PostgreSQL and present detailed performance experiments analyzing both approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kunft:2019:IRO, author = "Andreas Kunft and Asterios Katsifodimos and Sebastian Schelter and Sebastian Bre{\ss} and Tilmann Rabl and Volker Markl", title = "An intermediate representation for optimizing machine learning pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1553--1567", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342633", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning (ML) pipelines for model training and validation typically include preprocessing, such as data cleaning and feature engineering, prior to training an ML model. Preprocessing combines relational algebra and user-defined functions (UDFs), while model training uses iterations and linear algebra. Current systems are tailored to either of the two. As a consequence, preprocessing and ML steps are optimized in isolation. To enable holistic optimization of ML training pipelines, we present Lara, a declarative domain-specific language for collections and matrices. Lara's intermediate representation (IR) reflects on the complete program, i.e., UDFs, control flow, and both data types. Two views on the IR enable diverse optimizations. Monads enable operator pushdown and fusion across type and loop boundaries.
Combinators provide the semantics of domain-specific operators and optimize data access and cross-validation of ML algorithms. Our experiments on preprocessing pipelines and selected ML algorithms show the effects of our proposed optimizations on dense and sparse data, which achieve speedups of up to an order of magnitude.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2019:ARD, author = "Yuanwei Fang and Chen Zou and Andrew A. Chien", title = "Accelerating raw data analysis with the {ACCORDA} software and hardware architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1568--1582", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342634", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The data science revolution and growing popularity of data lakes make efficient processing of raw data increasingly important. To address this, we propose the ACCelerated Operators for Raw Data Analysis (ACCORDA) architecture. By extending the operator interface (subtype with encoding) and employing a uniform runtime worker model, ACCORDA integrates data transformation acceleration seamlessly, enabling a new class of encoding optimizations and robust high-performance raw data processing. Together, these key features preserve the software system architecture, empowering state-of-art heuristic optimizations to drive flexible data encoding for performance. 
ACCORDA derives performance from its software architecture, but depends critically on the acceleration of the Unstructured Data Processor (UDP) that is integrated into the memory-hierarchy, and accelerates data transformation tasks by 16x-21x (parsing, decompression) to as much as 160x (deserialization) compared to an x86 core. We evaluate ACCORDA using TPC-H queries on tabular data formats, exercising raw data properties such as parsing and data conversion. The ACCORDA system achieves 2.9x-13.2x speedups when compared to SparkSQL, reducing raw data processing overhead to a geomean of 1.2x (20\%). In doing so, ACCORDA robustly matches or outperforms prior systems that depend on caching loaded data, while computing on raw, unloaded data. This performance benefit is robust across format complexity, query predicates, and selectivity (data statistics). ACCORDA's encoding-extended operator interface unlocks aggressive encoding-oriented optimizations that deliver 80\% average performance increase over the 7 affected TPC-H queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Siddique:2019:CST, author = "A. B. Siddique and Ahmed Eldawy and Vagelis Hristidis", title = "Comparing synopsis techniques for approximate spatial data analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1583--1596", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342635", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The increasing amount of spatial data calls for new scalable query processing techniques. One of the techniques that are getting attention is data synopsis, which summarizes the data using samples or histograms and computes an approximate answer based on the synopsis. 
This general technique is used in selectivity estimation, clustering, partitioning, load balancing, and visualization, among others. This paper experimentally studies four spatial data synopsis techniques for three common data analysis problems, namely, selectivity estimation, k-means clustering, and spatial partitioning. We run an extensive experimental evaluation on both real and synthetic datasets of up to 2.7 billion records to study the trade-offs between the synopsis methods and their applicability in big spatial data analysis. For each of the three problems, we compare with baseline techniques that operate on the whole dataset and evaluate the synopsis generation time, the time for computing an approximate answer on the synopsis, and the accuracy of the result. We present our observations about when each synopsis technique performs best.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{El-Hindi:2019:BSD, author = "Muhammad El-Hindi and Carsten Binnig and Arvind Arasu and Donald Kossmann and Ravi Ramamurthy", title = "{BlockchainDB}: a shared database on blockchains", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1597--1609", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342636", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we present BlockchainDB, which leverages blockchains as a storage layer and introduces a database layer on top that extends blockchains by classical data management techniques (e.g., sharding) as well as a standardized query interface to facilitate the adoption of blockchains for data sharing use cases. 
We show that by introducing the additional database layer, we are able to improve the performance and scalability when using blockchains for data sharing and also massively decrease the complexity for organizations intending to use blockchains for data sharing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jia:2019:ETS, author = "Ruoxi Jia and David Dao and Boxin Wang and Frances Ann Hubis and Nezihe Merve Gurel and Bo Li and Ce Zhang and Costas Spanos and Dawn Song", title = "Efficient task-specific data valuation for nearest neighbor algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1610--1623", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342637", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a data set D containing millions of data points and a data consumer who is willing to pay for \$X to train a machine learning (ML) model over D, how should we distribute this \$X to each data point to reflect its ``value''? In this paper, we define the ``relative value of data'' via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O (2$^N$ ) model evaluations for exact computation and O ( N log N ) for ( $ \epsilon $, $ \delta $ )-approximation. In this paper, we focus on one popular family of ML models relying on K -nearest neighbors ( K NN).
The most surprising result is that for unweighted K NN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O ( N log N ) time --- an exponential improvement on computational complexity! Moreover, for ( $ \epsilon $, $ \delta $ )-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O ( N$^{h (\epsilon, K)}$ log N ) when $ \epsilon $ is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighted K NN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O ( N$^K$ ) complexity for weighted K NN). We thus propose a Monte Carlo approximation algorithm, which is O ( N (log N )$^2$ /(log K )$^2$ ) times more efficient than the baseline approximation algorithm.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Saxena:2019:DID, author = "Hemant Saxena and Lukasz Golab and Ihab F.
Ilyas", title = "Distributed implementations of dependency discovery algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1624--1636", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342638", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We analyze the problem of discovering dependencies from distributed big data. Existing (non-distributed) algorithms focus on minimizing computation by pruning the search space of possible dependencies. However, distributed algorithms must also optimize communication costs, especially in shared-nothing settings, leading to a more complex optimization space. To understand this space, we introduce six primitives shared by existing dependency discovery algorithms, corresponding to data processing steps separated by communication barriers. Through case studies, we show how the primitives allow us to analyze the design space and develop communication-optimized implementations. Finally, we support our analysis with an experimental evaluation on real datasets.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zamanian:2019:RDH, author = "Erfan Zamanian and Xiangyao Yu and Michael Stonebraker and Tim Kraska", title = "Rethinking database high availability with {RDMA} networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1637--1650", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342639", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Highly available database systems rely on data replication to tolerate machine failures. 
Both classes of existing replication algorithms, active-passive and active-active, were designed in a time when network was the dominant performance bottleneck. In essence, these techniques aim to minimize network communication between replicas at the cost of incurring more processing redundancy; a trade-off that suitably fitted the conventional wisdom of distributed database design. However, the emergence of next-generation networks with high throughput and low latency calls for revisiting these assumptions. In this paper, we first make the case that in modern RDMA-enabled networks, the bottleneck has shifted to CPUs, and therefore the existing network-optimized replication techniques are no longer optimal. We present Active-Memory Replication, a new high availability scheme that efficiently leverages RDMA to completely eliminate the processing redundancy in replication. Using Active-Memory, all replicas dedicate their processing power to executing new transactions, as opposed to performing redundant computation. Active-Memory maintains high availability and correctness in the presence of failures through an efficient RDMA-based undo-logging scheme. Our evaluation against active-passive and active-active schemes shows that Active-Memory is up to a factor of 2 faster than the second-best protocol on RDMA-based networks.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bressan:2019:MFM, author = "Marco Bressan and Stefano Leucci and Alessandro Panconesi", title = "{Motivo}: fast motif counting via succinct color coding and adaptive sampling", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1651--1663", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342640", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The randomized technique of color coding is behind state-of-the-art algorithms for estimating graph motif counts. Those algorithms, however, are not yet capable of scaling well to very large graphs with billions of edges. In this paper we develop novel tools for the ``motif counting via color coding'' framework. As a result, our new algorithm, MOTIVO, scales to much larger graphs while at the same time providing more accurate motif counts than ever before. This is achieved thanks to two types of improvements. First, we design new succinct data structures for fast color coding operations, and a biased coloring trick that trades accuracy versus resource usage. These optimizations drastically reduce the resource requirements of color coding. Second, we develop an adaptive motif sampling strategy, based on a fractional set cover problem, that breaks the additive approximation barrier of standard sampling. This gives multiplicative approximations for all motifs at once, allowing us to count not only the most frequent motifs but also extremely rare ones. To give an idea of the improvements, in 40 minutes MOTIVO counts 7-nodes motifs on a graph with 65M nodes and 1.8B edges; this is 30 and 500 times larger than the state of the art, respectively in terms of nodes and edges.
On the accuracy side, in one hour MOTIVO produces accurate counts of $ \approx $ 10.000 distinct 8-node motifs on graphs where state-of-the-art algorithms fail even to find the second most frequent motif. Our method requires just a high-end desktop machine. These results show how color coding can bring motif mining to the realm of truly massive graphs using only ordinary hardware.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Poddar:2019:AED, author = "Rishabh Poddar and Tobias Boelter and Raluca Ada Popa", title = "{Arx}: an encrypted database using semantically secure encryption", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1664--1678", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342641", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, encrypted databases have emerged as a promising direction that provides data confidentiality without sacrificing functionality: queries are executed on encrypted data. However, many practical proposals rely on a set of weak encryption schemes that have been shown to leak sensitive data. In this paper, we propose Arx, a practical and functionally rich database system that encrypts the data only with semantically secure encryption schemes. We show that Arx supports real applications such as ShareLaTeX with a modest performance overhead.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gao:2019:EKG, author = "Junyang Gao and Xian Li and Yifan Ethan Xu and Bunyamin Sisman and Xin Luna Dong and Jun Yang", title = "Efficient knowledge graph accuracy evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1679--1691", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342642", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Estimation of the accuracy of a large-scale knowledge graph (KG) often requires humans to annotate samples from the graph. How to obtain statistically meaningful estimates for accuracy evaluation while keeping human annotation costs low is a problem critical to the development cycle of a KG and its practical applications. Surprisingly, this challenging problem has largely been ignored in prior research. To address the problem, this paper proposes an efficient sampling and evaluation framework, which aims to provide quality accuracy evaluation with strong statistical guarantee while minimizing human efforts. Motivated by the properties of the annotation cost function observed in practice, we propose the use of cluster sampling to reduce the overall cost. We further apply weighted and two-stage sampling as well as stratification for better sampling designs. We also extend our framework to enable efficient incremental evaluation on evolving KG, introducing two solutions based on stratified sampling and a weighted variant of reservoir sampling. Extensive experiments on real-world datasets demonstrate the effectiveness and efficiency of our proposed solution. 
Compared to baseline approaches, our best solutions can provide up to 60\% cost reduction on static KG evaluation and up to 80\% cost reduction on evolving KG evaluation, without loss of evaluation quality.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Mhedhbi:2019:OSQ, author = "Amine Mhedhbi and Semih Salihoglu", title = "Optimizing subgraph queries by combining binary and worst-case optimal joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1692--1704", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342643", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of optimizing subgraph queries using the new worst-case optimal join plans. Worst-case optimal plans evaluate queries by matching one query vertex at a time using multi-way intersections. The core problem in optimizing worst-case optimal plans is to pick an ordering of the query vertices to match. We design a cost-based optimizer that (i) picks efficient query vertex orderings for worst-case optimal plans; and (ii) generates hybrid plans that mix traditional binary joins with worst-case optimal style multiway intersections. Our cost metric combines the cost of binary joins with a new cost metric called intersection-cost. The plan space of our optimizer contains plans that are not in the plan spaces based on tree decompositions from prior work. In addition to our optimizer, we describe an adaptive technique that changes the orderings of the worst-case optimal subplans during query execution. We demonstrate the effectiveness of the plans our optimizer picks and the effectiveness of the adaptive technique through extensive experiments. 
Our optimizer is integrated into the Graphflow DBMS.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marcus:2019:NLQ, author = "Ryan Marcus and Parimarjan Negi and Hongzi Mao and Chi Zhang and Mohammad Alizadeh and Tim Kraska and Olga Papaemmanouil and Nesime Tatbul", title = "{Neo}: a learned query optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1705--1718", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342644", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimization is one of the most challenging problems in database systems. Despite the progress made over the past decades, query optimizers remain extremely complex components that require a great deal of hand-tuning for specific workloads and datasets. Motivated by this shortcoming and inspired by recent advances in applying machine learning to data management challenges, we introduce Neo ( Neural Optimizer ), a novel learning-based query optimizer that relies on deep neural networks to generate query execution plans. Neo bootstraps its query optimization model from existing optimizers and continues to learn from incoming queries, building upon its successes and learning from its failures. Furthermore, Neo naturally adapts to underlying data patterns and is robust to estimation errors. Experimental results demonstrate that Neo, even when bootstrapped from a simple optimizer like PostgreSQL, can learn a model that offers similar performance to state-of-the-art commercial optimizers, and in some cases even surpass them.", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2019:EAD, author = "Yixiang Fang and Kaiqiang Yu and Reynold Cheng and Laks V. S. Lakshmanan and Xuemin Lin", title = "Efficient algorithms for densest subgraph discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1719--1732", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342645", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Densest subgraph discovery (DSD) is a fundamental problem in graph mining. It has been studied for decades, and is widely used in various areas, including network science, biological analysis, and graph databases. Given a graph G, DSD aims to find a subgraph D of G with the highest density (e.g., the number of edges over the number of vertices in D ). Because DSD is difficult to solve, we propose a new solution paradigm in this paper. Our main observation is that the densest subgraph can be accurately found through a k -core (a kind of dense subgraph of G ), with theoretical guarantees. Based on this intuition, we develop efficient exact and approximation solutions for DSD. Moreover, our solutions are able to find the densest subgraphs for a wide range of graph density definitions, including clique-based- and general pattern-based density. We have performed extensive experimental evaluation on both real and synthetic datasets. Our results show that our algorithms are up to four orders of magnitude faster than existing approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Marcus:2019:PSD, author = "Ryan Marcus and Olga Papaemmanouil", title = "Plan-structured deep neural network models for query performance prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1733--1746", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342646", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query performance prediction, the task of predicting a query's latency prior to execution, is a challenging problem in database management systems. Existing approaches rely on features and performance models engineered by human experts, but often fail to capture the complex interactions between query operators and input relations, and generally do not adapt naturally to workload characteristics and patterns in query execution plans. In this paper, we argue that deep learning can be applied to the query performance prediction problem, and we introduce a novel neural network architecture for the task: a plan-structured neural network. Our neural network architecture matches the structure of any optimizer-selected query execution plan and predicts its latency with high accuracy, while eliminating the need for human-crafted input features. A number of optimizations are also proposed to reduce training overhead without sacrificing effectiveness. We evaluated our techniques on various workloads and we demonstrate that our approach can out-perform the state-of-the-art in query performance prediction.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ren:2019:SSL, author = "Kun Ren and Dennis Li and Daniel J.
Abadi", title = "{SLOG}: serializable, low-latency, geo-replicated transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1747--1761", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342647", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For decades, applications deployed on a world-wide scale have been forced to give up at least one of (1) strict serializability (2) low latency writes (3) high transactional throughput. In this paper we discuss SLOG: a system that avoids this tradeoff for workloads which contain physical region locality in data access. SLOG achieves high-throughput, strictly serializable ACID transactions at geo-replicated distance and scale for all transactions submitted across the world, all the while achieving low latency for transactions that initiate from a location close to the home region for data they access. Experiments find that SLOG can reduce latency by more than an order of magnitude relative to state-of-the-art strictly serializable geo-replicated database systems such as Spanner and Calvin, while maintaining high throughput under contention.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Paparrizos:2019:GET, author = "John Paparrizos and Michael J. 
Franklin", title = "{GRAIL}: efficient time-series representation learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "11", pages = "1762--1777", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3342263.3342648", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The analysis of time series is becoming increasingly prevalent across scientific disciplines and industrial applications. The effectiveness and the scalability of time-series mining techniques critically depend on design choices for three components responsible for (i) representing; (ii) comparing; and (iii) indexing time series. Unfortunately, these components have to date been investigated and developed independently, often resulting in mutually incompatible methods. The lack of a unified approach has hindered progress towards fast and accurate analytics over massive time-series collections. To address this major drawback, we present GRAIL, a generic framework to learn compact time-series representations that preserve the properties of a user-specified comparison function. Given the comparison function, GRAIL (i) extracts landmark time series using clustering; (ii) optimizes necessary parameters; and (iii) exploits approximations for kernel methods to construct representations in linear time and space by expressing each time series as a combination of the landmark time series. We extensively evaluate GRAIL for querying, classification, clustering, sampling, and visualization of time series. For these tasks, methods leveraging GRAIL's representations are significantly faster and at least as accurate as state-of-the-art methods operating over the raw time series. GRAIL shows promise as a new primitive for highly accurate, yet scalable, time-series analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Damasio:2019:GGA,
  author          = "Guilherme Damasio and Spencer Bryson and Vincent Corvinelli and Parke Godfrey and Piotr Mierzejewski and Jaroslaw Szlichta and Calisto Zuzarte",
  title           = "{GALO}: guided automated learning for re-optimization",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1778--1781",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352064",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Query performance problem determination is usually performed manually in consultation with experts through the analysis of query plans. However, this is an excessively time consuming, human error-prone, and costly process. GALO is a novel system that automates this process. The tool automatically learns recurring problem patterns in query plans over workloads in an offline learning phase to build a knowledge base of plan rewrite remedies. GALO's knowledge base is built on RDF and SPARQL, which is well-suited for manipulating and querying over SQL query plans, which are graphs themselves. It then uses the knowledge base online to re-optimize queries queued for execution to improve performance, often quite dramatically.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% NOTE(review): in the abstract below, the extraction artifact ``Db2TM''
%%% (a flattened trademark sign) has been restored as Db2\texttrademark{}.
@Article{Tian:2019:SGS,
  author          = "Yuanyuan Tian and Wen Sun and Sui Jun Tong and En Liang Xu and Mir Hamid Pirahesh and Wei Zhao",
  title           = "Synergistic graph and {SQL} analytics inside {IBM Db2}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1782--1785",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352065",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "To meet the challenge of analyzing rapidly growing graph and network data created by modern applications, a large number of specialized graph databases have emerged, such as Neo4j, JanusGraph, and Sqlg. At the same time, RDBMSs and SQL continue to support mission-critical business analytics. However, real-life analytical applications seldom contain only one type of analytics. They are often made of heterogeneous workloads, including SQL, machine learning, graph, and other analytics. In particular, SQL and graph analytics are usually accompanied together in one analytical workload. This means that graph and SQL analytics need to be synergistic with each other. Unfortunately, most existing graph databases are standalone and cannot easily integrate with relational databases. In addition, as a matter of fact, many graph data (data about relationships between objects or people) are already prevalent in relational databases, although they are not explicitly stored as graphs. Performing graph analytics on these relational graph data today requires exporting large amount of data to the specialized graph databases. A natural question arises: can SQL and graph analytics be performed synergistically in a same system? In this demo, we present such a working system called IBM Db2 Graph. Db2 Graph is an in-DBMS graph query approach. It is implemented as a layer inside an experimental IBM Db2\texttrademark{}, and thus can support synergistic graph and SQL analytics efficiently. Db2 Graph employs a graph overlay approach to expose a graph view of the relational data. This approach flexibly retrofits graph queries to existing graph data stored in relational tables. We use an example scenario on health insurance claim analysis to demonstrate how Db2 Graph is used to support synergistic graph and SQL analytics inside Db2.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Ding:2019:CDC,
  author          = "Xiaoou Ding and Hongzhi Wang and Jiaxuan Su and Zijue Li and Jianzhong Li and Hong Gao",
  title           = "{Cleanits}: a data cleaning system for industrial time series",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1786--1789",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352066",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "The great amount of time series generated by machines has enormous value in intelligent industry. Knowledge can be discovered from high-quality time series, and used for production optimization and anomaly detection in industry. However, the original sensors data always contain many errors. This requires a sophisticated cleaning strategy and a well-designed system for industrial data cleaning. Motivated by this, we introduce Cleanits, a system for industrial time series cleaning. It implements an integrated cleaning strategy for detecting and repairing three kinds of errors in industrial time series. We develop reliable data cleaning algorithms, considering features of both industrial time series and domain knowledge.
We demonstrate Cleanits with two real datasets from power plants. The system detects and repairs multiple dirty data precisely, and improves the quality of industrial time series effectively. Cleanits has a friendly interface for users, and result visualization along with logs are available during each cleaning process.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Zhang:2019:IIT,
  author          = "Yipeng Zhang and Zhifeng Bao and Songsong Mo and Yuchen Li and Yanghao Zhou",
  title           = "{ITAA}: an intelligent trajectory-driven outdoor advertising deployment assistant",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1790--1793",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352067",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "In this paper, we demonstrate an Intelligent Trajectory-driven outdoor Advertising deployment Assistant (ITAA), which assists users to find an optimal strategy for outdoor advertising (ad) deployment. The challenge is how to measure the influence to the moving trajectories of ads, and how to optimize the placement of ads among billboards that maximize the influence has been proven NP-hard. Therefore, we develop a framework based on two trajectory-driven influence models. ITAA is built upon this framework with a user-friendly UI. It serves both ad companies and their customers. We enhance the interpretability to improve the user's understanding of the influence of ads. The interactive function of ITAA is made interpretable and easy to engage.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Qian:2019:SHL,
  author          = "Kun Qian and Lucian Popa and Prithviraj Sen",
  title           = "{SystemER}: a human-in-the-loop system for explainable entity resolution",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1794--1797",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352068",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Entity Resolution (ER) is the task of identifying different representations of the same real-world object. To achieve scalability and the desired level of quality, the typical ER pipeline includes multiple steps that may involve low-level coding and extensive human labor. We present SystemER, a tool for learning explainable ER models that reduces the human labor all throughout the stages of the ER pipeline. SystemER achieves explainability by learning rules that not only perform a given ER task but are human-comprehensible; this provides transparency into the learning process, and further enables verification and customization of the learned model by the domain experts. By leveraging a human in the loop and active learning, SystemER also ensures that a small number of labeled examples is sufficient to learn high-quality ER models. SystemER is a full-fledged tool that includes an easy to use interface, support for both flat files and semi-structured data, and scale-out capabilities by distributing computation via Apache Spark.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Huynh:2019:BEF,
  author          = "Viet-Phi Huynh and Paolo Papotti",
  title           = "{Buckle}: evaluating fact checking algorithms built on knowledge bases",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1798--1801",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352069",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Fact checking is the task of determining if a given claim holds. Several algorithms have been developed to check facts with reference information in the form of knowledge bases. We demonstrate BUCKLE, an open-source benchmark for comparing and evaluating fact checking algorithms in a level playing field across a range of scenarios. The demo is centered around three main lessons. To start, we show how, by changing the properties of the training and test facts, it is possible to influence significantly the performance of the algorithms. We then show the role of the reference data. Finally, we discuss the performance for algorithms designed on different principles and assumptions, as well as approaches that address the link prediction task in knowledge bases.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Gao:2019:QSE,
  author          = "Peng Gao and Xusheng Xiao and Zhichun Li and Kangkook Jee and Fengyuan Xu and Sanjeev R. Kulkarni and Prateek Mittal",
  title           = "A query system for efficiently investigating complex attack behaviors for enterprise security",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1802--1805",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352070",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "The need for countering Advanced Persistent Threat (APT) attacks has led to the solutions that ubiquitously monitor system activities in each enterprise host, and perform timely attack investigation over the monitoring data for uncovering the attack sequence. However, existing general-purpose query systems lack explicit language constructs for expressing key properties of major attack behaviors, and their semantics-agnostic design often produces inefficient execution plans for queries. To address these limitations, we build Aiql, a novel query system that is designed with novel types of domain-specific optimizations to enable efficient attack investigation. Aiql provides (1) a domain-specific data model and storage for storing the massive system monitoring data, (2) a domain-specific query language, Attack Investigation Query Language (Aiql) that integrates critical primitives for expressing major attack behaviors, and (3) an optimized query engine based on the characteristics of the data and the semantics of the query to efficiently schedule the execution. We have deployed Aiql in NEC Labs America comprising 150 hosts. In our demo, we aim to show the complete usage scenario of Aiql by (1) performing an APT attack in a controlled environment, and (2) using Aiql to investigate such attack by querying the collected system monitoring data that contains the attack traces.
The audience will have the option to perform the APT attack themselves under our guidance, and interact with the system and investigate the attack via issuing queries and checking the query results through our web UI.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Miao:2019:CEO,
  author          = "Zhengjie Miao and Qitian Zeng and Chenjie Li and Boris Glavic and Oliver Kennedy and Sudeepa Roy",
  title           = "{CAPE}: explaining outliers by counterbalancing",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1806--1809",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352071",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "In this demonstration we showcase Cape, a system that explains surprising aggregation outcomes. In contrast to previous work, which relies exclusively on provenance, Cape explains outliers in aggregation queries through related outliers in the opposite direction that provide counterbalance. The foundation of our approach are aggregate regression patterns (ARPs) that describe coarse-grained trends in the data. We define outliers as deviations from such patterns and present an efficient algorithm to find counterbalances explaining outliers. In the demonstration, the audience can run aggregation queries over real world datasets, identify outliers of interest in the result of such queries, and browse the patterns and explanations returned by Cape.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

%%% NOTE(review): the feature name at the end of the abstract below is quoted
%%% TeX-style (`...') so the opening quote does not typeset as a closing quote.
@Article{Ramachandra:2019:BAI,
  author          = "Karthik Ramachandra and Kwanghyun Park",
  title           = "{BlackMagic}: automatic inlining of scalar {UDFs} into {SQL} queries with {Froid}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1810--1813",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352072",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Relational DBMSs allow users to extend the standard declarative SQL language surface using User Defined Functions (UDFs) that implement custom behavior. While UDFs offer many advantages, it is well-known amongst practitioners that they can cause severe degradation in query performance. This degradation is due to the fact that state-of-the-art query optimizers treat UDFs as black boxes and do not reason about them during optimization. We demonstrate Froid, a framework for optimizing UDFs by opening up this black box and exposing its underlying operations to the query optimizer. It achieves this by systematically translating the entire body of an imperative multi-statement UDF into a single relational algebraic expression. Thereby, any query invoking this UDF is transformed into a query with a nested sub-query that is semantically equivalent to the UDF. We then leverage existing sub-query optimization techniques and thereby get efficient, set-oriented, parallel query plans as opposed to inefficient, iterative, serial execution of UDFs. We demonstrate the benefits of Froid including performance gains of up to multiple orders of magnitude on real workloads. Froid is available as a feature of Microsoft SQL Server 2019 called `Scalar UDF Inlining'.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Berg:2019:PPD,
  author          = "Lukas Berg and Tobias Ziegler and Carsten Binnig and Uwe R{\"o}hm",
  title           = "{ProgressiveDB}: progressive data analytics as a middleware",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1814--1817",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352073",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "ProgressiveDB transforms any standard SQL database into a progressive database capable of continuous, approximate query processing. It introduces a few small extensions to the SQL query language that allow clients to express progressive analytical queries. These extensions are processed in the ProgressiveDB middleware that sits between a database application and the underlying database providing interactive query processing as well as query steering capabilities to the user. In our demo, we show how this system allows a database application with a graphical user interface to interact with different backends, while providing the user with immediate feedback during exploratory data exploration of an on-time flight database. ProgressiveDB also supports efficient query steering by providing a new technique, called progressive views, which allows the intermediate results of one progressive query to be shared and reused by multiple concurrent progressive queries with refined scope.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kara:2019:DHT,
  author          = "Kaan Kara and Zeke Wang and Ce Zhang and Gustavo Alonso",
  title           = "{doppioDB 2.0}: hardware techniques for improved integration of machine learning into databases",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1818--1821",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352074",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Database engines are starting to incorporate machine learning (ML) functionality as part of their repertoire. Machine learning algorithms, however, have very different characteristics than those of relational operators. In this demonstration, we explore the challenges that arise when integrating generalized linear models into a database engine and how to incorporate hardware accelerators into the execution, a tool now widely used for ML workloads. The demo explores two complementary alternatives: (1) how to train models directly on compressed/encrypted column-stores using a specialized coordinate descent engine, and (2) how to use a bitwise weaving index for stochastic gradient descent on low precision input data. We present these techniques as implemented in our prototype database doppioDB 2.0 and show how the new functionality can be used from SQL.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Pahins:2019:CSV,
  author          = "Cicero A. L. Pahins and Behrooz Omidvar-Tehrani and Sihem Amer-Yahia and Val{\'e}rie Siroux and Jean-Louis Pepin and Jean-Christian Borel and Jo{\~a}o L. D. Comba",
  title           = "{COVIZ}: a system for visual formation and exploration of patient cohorts",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1822--1825",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352075",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We demonstrate COVIZ, an interactive system to visually form and explore patient cohorts. COVIZ seamlessly integrates visual cohort formation and exploration, making it a single destination for hypothesis generation. COVIZ is easy to use by medical experts and offers many features: (1) It provides the ability to isolate patient demographics (e.g., their age group and location), health markers (e.g., their body mass index), and treatments (e.g., Ventilation for respiratory problems), and hence facilitates cohort formation; (2) It summarizes the evolution of treatments of a cohort into health trajectories, and lets medical experts explore those trajectories; (3) It guides them in examining different facets of a cohort and generating hypotheses for future analysis; (4) Finally, it provides the ability to compare the statistics and health trajectories of multiple cohorts at once. COVIZ relies on QDS, a novel data structure that encodes and indexes various data distributions to enable their efficient retrieval. Additionally, COVIZ visualizes air quality data in the regions where patients live to help with data interpretations. We demonstrate two key scenarios, ecological scenario and case cross-over scenario. A video demonstration of COVIZ is accessible via http://bit.ly/video-coviz.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Franke:2019:PTF,
  author          = "Martin Franke and Ziad Sehili and Erhard Rahm",
  title           = "{PRIMAT}: a toolbox for fast privacy-preserving matching",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1826--1829",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352076",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Privacy-preserving record linkage (PPRL) is increasingly demanded in real-world applications, e.g., in the health-care domain, to combine person-related data for data analysis while preserving the privacy of individuals. However, the adoption of PPRL is hampered by the absence of easy-to-use and powerful PPRL tools covering the entire PPRL process. We therefore demonstrate Primat, a flexible and scalable tool that enables the definition and application of tailored PPRL workflows as well as the comparative evaluation of different PPRL methods. We introduce the main requirements for PPRL tools and discuss previous tool efforts that do not fully meet the requirements and have not been applied in practice. By contrast, Primat covers the whole PPRL life-cycle and improves applicability by providing various components for data owners and the central linkage to be executed by a trusted linkage unit.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Marcus:2019:NFR,
  author          = "Ryan Marcus and Chi Zhang and Shuai Yu and Geoffrey Kao and Olga Papaemmanouil",
  title           = "{NashDB}: fragmentation, replication, and provisioning using economic methods",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1830--1833",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352077",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Modern elastic computing systems allow applications to scale up and down automatically, increasing capacity for workload spikes and ensuring cost savings during lulls in activity. Adapting database management systems to work on top of such elastic infrastructure is not a trivial task, and requires a deep understanding of the sophisticated interplay between data fragmentation, replica allocation, and cluster provisioning. This demonstration showcases NashDB, an end-to-end method for addressing these concerns in an automatic way. NashDB relies on economic models to maximize query performance while staying within a user's budget. This demonstration will (1) allow audience members to see how NashDB handles shifting workloads in an adaptive way, and (2) allow audience members to test NashDB themselves by constructing synthetic workloads and seeing how NashDB adapts a cluster to them in real time.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Sabek:2019:FAS,
  author          = "Ibrahim Sabek and Mashaal Musleh and Mohamed F. Mokbel",
  title           = "Flash in action: scalable spatial data analysis using {Markov} logic networks",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1834--1837",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352078",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "The current explosion in spatial data raises the need for efficient spatial analysis tools to extract useful information from such data. However, existing tools are neither generic nor scalable when dealing with big spatial data. This demo presents Flash; a framework for generic and scalable spatial data analysis, with a special focus on spatial probabilistic graphical modelling (SPGM). Flash exploits Markov Logic Networks (MLN) to express SPGM as a set of declarative logical rules. In addition, it provides spatial variations of the scalable RDBMS-based learning and inference techniques of MLN to efficiently perform SPGM predictions. To show Flash effectiveness, we demonstrate three applications that use Flash in their SPGM: (1) Bird monitoring, (2) Safety analysis, and (3) Land use change tracking.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Kuhring:2019:CBO,
  author          = "Lucas Kuhring and Zsolt Istv{\'a}n",
  title           = "{I} can't believe it's not (only) software!: bionic distributed storage for {Parquet} files",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1838--1841",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352079",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "There is a steady increase in the size of data stored and processed as part of data science applications, leading to bottlenecks and inefficiencies at various layers of the stack. One way of reducing such bottlenecks and increasing energy efficiency is by tailoring the underlying distributed storage solution to the application domain, using resources more efficiently. We explore this idea in the context of a popular column-oriented storage format used in big data workloads, namely Apache Parquet. Our prototype uses an FPGA-based storage node that offers high bandwidth data deduplication and a companion software library that exposes an API for Parquet file access. This way the storage node remains general purpose and could be shared by applications from different domains, while, at the same time, benefiting from deduplication well suited to Apache Parquet files and from selective reads of columns in the file. In this demonstration we show, on the one hand, that by relying on the FPGA's dataflow processing model, it is possible to implement in-line deduplication without increasing latencies significantly or reducing throughput.
On the other hand, we highlight the benefits of implementing the application-specific aspects in a software library instead of FPGA circuits and how this enables, for instance, regular data science frameworks running in Python to access the data on the storage node and to offload filtering operations.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Choi:2019:VVI,
  author          = "Hyewon Choi and Erkang Zhu and Arsala Bangash and Ren{\'e}e J. Miller",
  title           = "{VISE}: vehicle image search engine with traffic camera",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1842--1845",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352080",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We present VISE, or Vehicle Image Search Engine, to support the fast search of similar vehicles from low-resolution traffic camera images. VISE can be used to trace and locate vehicles for applications such as police investigations when high-resolution footage is not available. Our system consists of three components: an interactive user-interface for querying and browsing identified vehicles; a scalable search engine for fast similarity search on millions of visual objects; and an image processing pipeline that extracts feature vectors of objects from video frames. We use transfer learning technique to integrate state-of-the-art Convolutional Neural Networks with two different refinement methods to achieve high retrieval accuracy. We also use an efficient high-dimensional nearest neighbor search index to enable fast retrieval speed.
In the demo, our system will offer users an interactive experience exploring a large database of traffic camera images that is growing in real time at 200K frames per day.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc. VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Goldberg:2019:WSF,
  author          = "Stephan Goldberg and Tova Milo and Slava Novgorodov and Kathy Razmadze",
  title           = "{WiClean}: a system for fixing {Wikipedia} interlinks using revision history patterns",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1846--1849",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352081",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "We present WiClean, a Wikipedia plug-in that supports the identification and cleaning of certain types of errors in Wikipedia interlinks. The system mines update patterns in Wikipedia revision logs, identifies the common time frames in which they occur, and employs them to signal incomplete/inconsistent updates and suggests corrections. We demonstrate the effectiveness of WiClean in identifying actual errors in a variety of Wikipedia entity types, interactively employing the VLDB'19 audience as editors to correct the identified errors.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment",
  fjournal        = "Proceedings of the VLDB Endowment",
  journal-URL     = "http://portal.acm.org/citation.cfm?id=J1174",
}

@Article{Roy:2019:SHC,
  author          = "Abhishek Roy and Alekh Jindal and Hiren Patel and Ashit Gosalia and Subru Krishnan and Carlo Curino",
  title           = "{SparkCruise}: handsfree computation reuse in {Spark}",
  journal         = j-PROC-VLDB-ENDOWMENT,
  volume          = "12",
  number          = "12",
  pages           = "1850--1853",
  month           = aug,
  year            = "2019",
  CODEN           = "????",
  DOI             = "https://doi.org/10.14778/3352063.3352082",
  ISSN            = "2150-8097",
  ISSN-L          = "2150-8097",
  bibdate         = "Wed Oct 2 06:49:02 MDT 2019",
  bibsource       = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract        = "Interactive data analytics is often inundated with common computations across multiple queries. These redundancies result in poor query performance and higher overall cost for the interactive query sessions. Obviously, reusing these common computations could lead to cost savings. However, it is difficult for the users to manually detect and reuse the common computations in their fast moving interactive sessions. In the paper, we propose to demonstrate SparkCruise, a computation reuse system that automatically selects the most useful common computations to materialize based on the past query workload. SparkCruise materializes these computations as part of query processing, so the users can continue with their query processing just as before and computation reuse is automatically applied in the background --- all without any modifications to the Spark code. We will invite the audience to play with several scenarios, such as workload redundancy insights and pay-as-you-go materialization, highlighting the utility of SparkCruise.",
  acknowledgement = ack-nhfb,
  ajournal        = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sandha:2019:DDM, author = "Sandeep Singh Sandha and Wellington Cabrera and Mohammed Al-Kateb and Sanjay Nair and Mani Srivastava", title = "In-database distributed machine learning: demonstration using {Teradata SQL} engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1854--1857", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352083", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning has enabled many interesting applications and is extensively being used in big data systems. The popular approach --- training machine learning models in frameworks like Tensorflow, Pytorch and Keras --- requires movement of data from database engines to analytical engines, which adds an excessive overhead on data scientists and becomes a performance bottleneck for model training. In this demonstration, we give a practical exhibition of a solution for the enablement of distributed machine learning natively inside database engines. During the demo, the audience will interactively use Python APIs in Jupyter Notebooks to train multiple linear regression models on synthetic regression datasets and neural network models on vision and sensory datasets directly inside Teradata SQL Engine.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2019:SLS, author = "Zhao Li and Xia Chen and Xuming Pan and Pengcheng Zou and Yuchen Li and Guoxian Yu", title = "{SHOAL}: large-scale hierarchical taxonomy via graph-based query coalition in e-commerce", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1858--1861", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352084", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "E-commerce taxonomy plays an essential role in online retail business. Existing taxonomy of e-commerce platforms organizes items into an ontology structure. However, the ontology-driven approach is subject to costly manual maintenance and often does not capture user's search intention, particularly when user searches by her personalized needs rather than a universal definition of the items. Observing that search queries can effectively express user's intention, we present a novel large-Scale Hierarchical taxOnomy via grAph based query coaLition ( SHOAL ) to bridge the gap between item taxonomy and user search intention. SHOAL organizes hundreds of millions of items into a hierarchical topic structure. Each topic that consists of a cluster of items denotes a conceptual shopping scenario, and is tagged with easy-to-interpret descriptions extracted from search queries. Furthermore, SHOAL establishes correlation between categories of ontology-driven taxonomy, and offers opportunities for explainable recommendation. The feedback from domain experts shows that SHOAL achieves a precision of 98\% in terms of placing items into the right topics, and the result of an online A/B test demonstrates that SHOAL boosts the Click Through Rate (CTR) by 5\%. 
SHOAL has been deployed in Alibaba and supports millions of searches for online shopping per day.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Xu:2019:DMD, author = "Min Xu and Tianhao Wang and Bolin Ding and Jingren Zhou and Cheng Hong and Zhicong Huang", title = "{DPSAaS}: multi-dimensional data sharing and analytics as services under local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1862--1865", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352085", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy has emerged as the de facto standard for privacy definitions, and been used by, e.g., Apple, Google, Uber, and Microsoft, to collect sensitive information about users and to build privacy-preserving analytics engines. However, most of such advanced privacy-protection techniques are not accessible to mid-size companies and app developers in the cloud. We demonstrate a lightweight middleware DPSAaS, which provides differentially private data-sharing-and-analytics functionality as cloud services. We focus on multi-dimensional analytical (MDA) queries under local differential privacy (LDP) in this demo. MDA queries against a fact table have predicates on (categorical or ordinal) dimensions and aggregate one or more measures. In the absence of a trusted agent, sensitive dimensions and measures are encoded in a privacy-preserving way locally using our LDP data sharing service, before being sent to the data collector. The data collector estimates the answers to MDA queries from the encoded data, using our data analytics service. 
We will highlight the design decisions of DPSAaS and twists made to LDP algorithms to fit the design, in order to smoothly connect DPSAaS to the data processing platform and analytics engines, and to facilitate efficient large-scale processing.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2019:PPS, author = "Yang Cao and Yonghui Xiao and Li Xiong and Liquan Bai and Masatoshi Yoshikawa", title = "{PriSTE}: protecting spatiotemporal event privacy in continuous location-based services", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1866--1869", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352086", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Location privacy-preserving mechanisms (LPPMs) have been extensively studied for protecting a user's location in location-based services. However, when user's perturbed locations are released continuously, existing LPPMs may not protect users' sensitive spatiotemporal event, such as ``visited hospital in the last week'' or ``regularly commuting between location 1 and location 2 every morning and afternoon'' (it is easy to infer that locations 1 and 2 may be home and office). In this demonstration, we demonstrate PriSTE for protecting spatiotemporal event privacy in continuous location release. First, to raise users' awareness of such a new privacy goal, we design an interactive tool to demonstrate how accurate an adversary could infer a secret spatiotemporal event from a sequence of locations or even LPPM-protected locations. The attendees can find that some spatiotemporal events are quite risky and even these state-of-the-art LPPMs do not always protect spatiotemporal event privacy. 
Second, we demonstrate how a user can use PriSTE to automatically or manually convert an LPPM for location privacy into one protecting spatiotemporal event privacy in continuous location-based services. Finally, we visualize the trade-off between privacy and utility so that users can choose appropriate privacy parameters in different application scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Deutch:2019:DOS, author = "Daniel Deutch and Evgeny Marants and Yuval Moskovitch", title = "{Datalignment}: ontology schema alignment through datalog containment", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1870--1873", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352087", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We focus on the problem of aligning ontology relations, namely finding relation names that correspond to the same or related concepts. Such alignment is a prerequisite to the integration of the multiple available Knowledge Bases many of which include similar concepts, differently termed. We propose a novel approach for this problem, by leveraging association rules --- originally mined in order to enrich the ontological content. Here, we treat the rules as Datalog programs and look for bounded-depth sub-programs that are contained in (or equivalent to) each other. Heads of such programs intuitively correspond to related concepts, and we propose them as candidates for alignment. The candidate alignments require further verification by experts; to this end we accompany each aligned pair with explanations based on the provenance of each relation according to its sub-program. 
We have implemented our novel solution in a system called Datalignment. We propose to demonstrate Datalignment, presenting the aligned pairs that it finds, and the computed explanations, in context of real-life Knowledge Bases.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ge:2019:IIH, author = "Congcong Ge and Yunjun Gao and Xiaoye Miao and Lu Chen and Christian S. Jensen and Ziyuan Zhu", title = "{IHCS}: an integrated hybrid cleaning system", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1874--1877", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352088", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data cleaning is a prerequisite to subsequent data analysis, and is known to often be time-consuming and labor-intensive. We present IHCS, a hybrid data cleaning system that integrates error detection and repair to contend effectively with multiple error types. In a preprocessing step that precedes the data cleaning, IHCS formats an input dataset to be cleaned, and transforms applicable data quality rules into a unified format. Then, an MLN index structure is formed according to the unified rules, enabling IHCS to handle multiple error types simultaneously. During the cleaning, IHCS first tackles abnormalities through an abnormal group process, and then, it generates multiple data versions based on the MLN index. Finally, IHCS eliminates conflicting values across the multiple versions, and derives the final unified clean data. A visual interface enables cleaning process monitoring and cleaning result analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Costa:2019:CGB, author = "Constantinos Costa and Xiaoyu Ge and Panos K. Chrysanthis", title = "{CAPRIO}: graph-based integration of indoor and outdoor data for path discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1878--1881", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352089", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, navigation and localization systems have emerged to support queries like the shortest distance in either indoor or outdoor with additional constraints. These systems, however, neither combine the indoor and outdoor information nor consider the external natural conditions like the weather that one may face across an outdoor path. In this demonstration paper we present CAPRIO, which proposes and implements a novel graph representation that integrates indoor and outdoor information to discover paths that personalize outdoor exposure while minimizing the overall path length. We also demonstrate how unifying the graph algorithms for indoor and outdoor navigation enables significant optimizations that would not be possible otherwise.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wu:2019:HAS, author = "Yingjun Wu and Jia Yu and Yuanyuan Tian and Richard Sidle and Ronald Barber", title = "{HERMIT} in action: succinct secondary indexing mechanism via correlation exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1882--1885", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352090", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database administrators construct secondary indexes on data tables to accelerate query processing in relational database management systems (RDBMSs). These indexes are built on top of the most frequently queried columns according to the data statistics. Unfortunately, maintaining multiple secondary indexes in the same database can be extremely space consuming, causing significant performance degradation due to the potential exhaustion of memory space. However, we find that there indeed exist many opportunities to save storage space by exploiting column correlations. We recently introduced Hermit, a succinct secondary indexing mechanism for modern RDBMSs. Hermit judiciously leverages the rich soft functional dependencies hidden among columns to prune out redundant structures for indexed key access. Instead of building a complete index that stores every single entry in the key columns, Hermit navigates any incoming key access queries to an existing index built on the correlated columns. This is achieved through the Tiered Regression Search Tree (TRS-Tree), a succinct, ML-enhanced data structure that performs fast curve fitting to adaptively and dynamically capture both column correlations and outliers. In this demonstration, we showcase Hermit's appealing characteristics. 
We not only demonstrate that Hermit can significantly reduce space consumption with limited performance overhead in terms of query response time and index maintenance time, but also explain in detail the rationale behind Hermit's high efficiency using interactive online query processing examples.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Loudet:2019:DSH, author = "Julien Loudet and Iulian Sandu-Popa and Luc Bouganim", title = "{DISPERS}: securing highly distributed queries on personal data management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1886--1889", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352091", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Personal Data Management Systems (PDMS) advance at a rapid pace allowing us to integrate all our personal data in a single place and use it for our benefit and for the benefit of the community. This leads to a significant paradigm shift since personal data become massively distributed and opens an important question: how to query this massively distributed data in an efficient, pertinent and privacy preserving way? This demonstration proposes a fully-distributed PDMS called DISPERS, built on top of SEP2P, allowing users to securely and efficiently share and query their personal data. The demonstration platform graphically illustrates the query execution in detail, showing that DISPERS leads to maximal system security with low and scalable overhead. Attendees are welcome to challenge the security provided by DISPERS using the proposed hacking tools.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Akhter:2019:SFS, author = "Adil Akhter and Marios Fragkoulis and Asterios Katsifodimos", title = "Stateful functions as a service in action", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1890--1893", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352092", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the serverless model, users upload application code to a cloud platform and the cloud provider undertakes the deployment, execution and scaling of the application, relieving users from all operational aspects. Although very popular, current serverless offerings offer poor support for the management of local application state, the main reason being that managing state and keeping it consistent at large scale is very challenging. As a result, the serverless model is inadequate for executing stateful, latency-sensitive applications. In this paper we present a high-level programming model for developing stateful functions and deploying them in the cloud. Our programming model allows functions to retain state as well as call other functions. In order to deploy stateful functions in a cloud infrastructure, we translate functions and their data exchanges into a stateful dataflow graph. With this paper we aim at demonstrating that using a modified version of an open-source dataflow engine as a runtime for stateful functions, we can deploy scalable and stateful services in the cloud with surprisingly low latency and high throughput.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ordookhanians:2019:DKO, author = "Allen Ordookhanians and Xin Li and Supun Nakandala and Arun Kumar", title = "Demonstration of {Krypton}: optimized {CNN} inference for occlusion-based deep {CNN} explanations", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1894--1897", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352093", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we present Krypton, a system for accelerating occlusion-based deep convolution neural network (CNN) explanation workloads. Driven by the success of CNNs in image understanding tasks, there is growing adoption of CNNs in various domains, including high stakes applications such as radiology. However, users of such applications often seek an ``explanation'' for why a CNN predicted a certain label. One of the most widely used approaches for explaining CNN predictions is the occlusion-based explanation (OBE) method. This approach is computationally expensive due to the large number of re-inference requests produced. Krypton reduces the runtime of OBE by up to 35x by enabling incremental and approximate inference optimizations that are inspired by classical database query optimization techniques. We allow the audience to interactively diagnose CNN predictions from several use cases, including radiology and natural images. A short video of our demonstration can be found here: https://youtu.be/1OWddbd4n6Y", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Miao:2019:LVE, author = "Zhengjie Miao and Andrew Lee and Sudeepa Roy", title = "{LensXPlain}: visualizing and explaining contributing subsets for aggregate query answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1898--1901", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352094", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we will present LensXPlain, an interactive system to help users understand answers of aggregate queries by providing meaningful explanations. Given a SQL group-by query and a question from a user `` why output o is high/low '', or `` why output o$_1$ is higher/lower than o$_2$ '', LensXPlain helps users explore the results and find subsets of tuples captured by predicates that contributed the most toward such observations. The contributions are measured either by intervention (if the contributing tuples are removed, the values or the ratios in the user question change in the opposite direction), or by aggravation (if the query is restricted to the contributing tuples, the observations change more in the same direction). LensXPlain uses ensemble learning for recommending useful attributes in explanations, and employs a suite of optimizations to enable explanation generation and refinement at an interactive speed. In the demonstration, the audience can run aggregation queries over real world datasets, browse the answers using a graphical user interface, ask questions on unexpected/interesting query results with simple visualizations, and explore and refine explanations returned by LensXPlain.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2019:JDL, author = "Yi Zhang and Zachary G. Ives", title = "{Juneau}: data lake management for {Jupyter}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1902--1905", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352095", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In collaborative settings such as multi-investigator laboratories, data scientists need improved tools to manage not their data records but rather their data sets and data products, to facilitate both provenance tracking and data (and code) reuse within their data lakes and file systems. We demonstrate the Juneau System, which extends computational notebook software (Jupyter Notebook) as an instrumentation and data management point for overseeing and facilitating improved dataset usage, through capabilities for indexing, searching, and recommending ``complementary'' data sources, previously extracted machine learning features, and additional training data. This demonstration focuses on how we help the user find related datasets via search.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hasani:2019:AEA, author = "Sona Hasani and Faezeh Ghaderi and Shohedul Hasan and Saravanan Thirumuruganathan and Abolfazl Asudeh and Nick Koudas and Gautam Das", title = "{ApproxML}: efficient approximate ad-hoc {ML} models through materialization and reuse", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1906--1909", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352096", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning (ML) has gained a pivotal role in answering complex predictive analytic queries. Model building for large scale datasets is one of the time consuming parts of the data science pipeline. Often data scientists are willing to sacrifice some accuracy in order to speed up this process during the exploratory phase. In this paper, we propose to demonstrate ApproxML, a system that efficiently constructs approximate ML models for new queries from previously constructed ML models using the concepts of model materialization and reuse. ApproxML supports a variety of ML models such as generalized linear models for supervised learning, and K-means and Gaussian Mixture model for unsupervised learning.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Essertel:2019:FAL, author = "Gr{\'e}gory Essertel and Ruby Y. 
Tahboub and Fei Wang and James Decker and Tiark Rompf", title = "{Flare \& Lantern}: efficiently swapping horses midstream", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1910--1913", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352097", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Running machine learning (ML) workloads at scale is as much a data management problem as a model engineering problem. Big performance challenges exist when data management systems invoke ML classifiers as user-defined functions (UDFs) or when stand-alone ML frameworks interact with data stores for data loading and pre-processing (ETL). In particular, UDFs can be precompiled or simply a black box for the data management system and the data layout may be completely different from the native layout, thus adding overheads at the boundaries. In this demo, we will show how bottlenecks between existing systems can be eliminated when their engines are designed around runtime compilation and native code generation, which is the case for many state-of-the-art relational engines as well as ML frameworks. We demonstrate an integration of Flare (an accelerator for Spark SQL), and Lantern (an accelerator for TensorFlow and PyTorch) that results in a highly optimized end-to-end compiled data path, switching between SQL and ML processing with negligible overhead.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Martins:2019:TES, author = "Ruben Martins and Jia Chen and Yanju Chen and Yu Feng and Isil Dillig", title = "{Trinity}: an extensible synthesis framework for data science", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1914--1917", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352098", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demo paper, we introduce Trinity, a general-purpose framework that can be used to quickly build domain-specific program synthesizers for automating many tedious tasks that arise in data science. We illustrate how Trinity can be used by three different users: First, we show how end-users can use Trinity's built-in synthesizers to automate data wrangling tasks. Second, we show how advanced users can easily extend existing synthesizers to support additional functionalities. Third, we show how synthesis experts can change the underlying search engine in Trinity. Overall, this paper is intended to demonstrate how users can quickly use, modify, and extend the Trinity framework with the goal of automating many tasks that are considered to be the ``janitor'' work of data science.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2019:PAA, author = "Zhiqi Huang and Ryan McKenna and George Bissias and Gerome Miklau and Michael Hay and Ashwin Machanavajjhala", title = "{PSynDB}: accurate and accessible private data generation", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1918--1921", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352099", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Across many application domains, trusted parties who collect sensitive information need mechanisms to safely disseminate data. A favored approach is to generate synthetic data: a dataset similar to the original, hopefully retaining its statistical features, but one that does not reveal the private information of contributors to the data. We present PSynDB, a web-based synthetic table generator that is built on recent privacy technologies [10,11,15]. PSynDB satisfies the formal guarantee of differential privacy and generates synthetic tables with high accuracy for tasks that the user specifies as important. PSynDB allows users to browse expected error rates before running the mechanism, a useful feature for making important policy decisions, such as setting the privacy loss budget. When the user has finished configuration, the tool outputs a data synthesis program that can be ported to a trusted environment. There it can be safely executed on the private data to produce the private synthetic dataset for broad dissemination.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chandramouli:2019:FFI, author = "Badrish Chandramouli and Dong Xie and Yinan Li and Donald Kossmann", title = "{FishStore}: fast ingestion and indexing of raw data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1922--1925", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352100", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The last decade has witnessed a huge increase in data being ingested into the cloud from a variety of data sources. The ingested data takes various forms such as JSON, CSV, and binary formats. Traditionally, data is either ingested into storage in raw form, indexed ad-hoc using range indices, or cooked into analytics-friendly columnar formats. None of these solutions is able to handle modern requirements on storage: making the data available immediately for ad-hoc and streaming queries while ingesting at extremely high throughputs. We demonstrate FishStore, our open-source concurrent latch-free storage layer for data with flexible schema. FishStore builds on recent advances in parsing and indexing techniques, and is based on multi-chain hash indexing of dynamically registered predicated subsets of data. We find predicated subset hashing to be a powerful primitive that supports a broad range of queries on ingested data and admits a higher performance (by up to an order of magnitude) implementation than current alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Diao:2019:SMF, author = "Yanlei Diao and Pawe{\l} Guzewicz and Ioana Manolescu and Mirjana Mazuran", title = "{Spade}: a modular framework for analytical exploration of {RDF} graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1926--1929", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352101", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "RDF data is complex; exploring it is hard, and can be done through many different metaphors. We have developed and propose to demonstrate Spade, a tool helping users discover meaningful content of an RDF graph by showing them the results of aggregation (OLAP-style) queries automatically identified from the data. Spade chooses aggregates that are visually interesting, a property formally based on statistic properties of the aggregation query results. While well understood for relational data, such exploration raises multiple challenges for RDF: facts, dimensions and measures have to be identified (as opposed to known beforehand); as there are more candidate aggregates, assessing their interestingness can be very costly; finally, ontologies bring novel specific challenges but also novel opportunities, enabling ontology-driven exploration from an aggregate initially proposed by the system. Spade is a generic, extensible framework, which we instantiated with: ( i ) novel methods for enumerating candidate measures and dimensions in the vast space of possibilities provided by an RDF graph; ( ii ) a set of aggregate interestingness functions; ( iii ) ontology-based interactive exploration, and ( iv ) efficient early-stop techniques for estimating the interestingness of an aggregate query. 
The demonstration will comprise interactive scenarios on a variety of large, interesting RDF graphs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dsilva:2019:MRD, author = "Joseph Vinish D'silva and Florestan {De Moor} and Bettina Kemme", title = "Making an {RDBMS} data scientist friendly: advanced in-database interactive analytics with visualization support", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1930--1933", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352102", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We are currently witnessing the rapid evolution and adoption of various data science frameworks that function external to the database. Any support from conventional RDBMS implementations for data science applications has been limited to procedural paradigms such as user-defined functions (UDFs) that lack exploratory programming support. Therefore, the current status quo is that during the exploratory phase, data scientists usually use the database system as the ``data storage'' layer of the data science framework, whereby the majority of computation and analysis is performed outside the database, e.g., at the client node. We demonstrate AIDA, an in-database framework for data scientists. AIDA allows users to write interactive Python code using a development environment such as a Jupyter notebook. The actual execution itself takes place inside the database (near-data), where a server component of AIDA, that resides inside the embedded Python interpreter of the RDBMS, manages the data sets and computations. 
The demonstration will also show the visualization capabilities of AIDA where the progress of computation can be observed through live updates. Our evaluations show that AIDA performs several times faster compared to contemporary external data science frameworks, but is much easier to use for exploratory development compared to database UDFs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zaouk:2019:UNG, author = "Khaled Zaouk and Fei Song and Chenghao Lyu and Arnab Sinha and Yanlei Diao and Prashant Shenoy", title = "{UDAO}: a next-generation unified data analytics optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1934--1937", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352103", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data analytics systems today still lack the ability to take user performance goals and budgetary constraints, collectively referred to as ``objectives'', and automatically configure an analytic job to achieve the objectives. This paper presents UDAO, a unified data analytics optimizer that can automatically determine the parameters of the runtime system, collectively called a job configuration, for general dataflow programs based on user objectives. UDAO embodies key techniques including in-situ modeling, which learns a model for each user objective in the same computing environment as the job is run, and multi-objective optimization, which computes a Pareto optimal set of job configurations to reveal tradeoffs between different objectives. 
Using benchmarks developed based on industry needs, our demonstration will allow the user to explore (1) learned models to gain insights into how various parameters affect user objectives; (2) Pareto frontiers to understand interesting tradeoffs between different objectives and how a configuration recommended by the optimizer explores these tradeoffs; (3) end-to-end benefits that UDAO can provide over default configurations or those manually tuned by engineers.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jo:2019:AFC, author = "Saehan Jo and Immanuel Trummer and Weicheng Yu and Xuezhi Wang and Cong Yu and Daniel Liu and Niyati Mehta", title = "{AggChecker}: a fact-checking system for text summaries of relational data sets", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1938--1941", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352104", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate AggChecker, a novel tool for verifying textual summaries of relational data sets. The system automatically verifies natural language claims about numerical aggregates against the underlying raw data. The system incorporates a combination of natural language processing, information retrieval, machine learning, and efficient query processing strategies. Each claim is translated into a semantically equivalent SQL query and evaluated against the database. Our primary goal is analogous to that of a spell-checker: to identify erroneous claims and provide guidance in correcting them. In this demonstration, we show that our system enables users to verify text summaries much more efficiently than a standard SQL interface.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:GIG, author = "Hanzhang Wang and Phuong Nguyen and Jun Li and Selcuk Kopru and Gene Zhang and Sanjeev Katariya and Sami Ben-Romdhane", title = "{GRANO}: interactive graph-based root cause analysis for cloud-native distributed data platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1942--1945", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352105", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate Grano, an end-to-end anomaly detection and root cause analysis (or RCA for short) system for cloud-native distributed data platform by providing a holistic view of the system component topology, alarms and application events. Grano provides: a Detection Layer to process large amount of time-series monitoring data to detect anomalies at logical and physical system components; an Anomaly Graph Layer with novel graph modeling and algorithms for leveraging system topology data and detection results to identify the root cause relevance at the system component level; and an Application Layer that automatically notifies on-call personnel and presents real-time and on-demand RCA support through an interactive graph interface. The system is deployed and evaluated using eBay's production data to help on-call personnel to shorten the identification of root cause from hours to minutes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Frey:2019:DHB, author = "Davide Frey and Marc X. 
Makkes and Pierre-Louis Roman and Fran{\c{c}}ois Ta{\"\i}ani and Spyros Voulgaris", title = "{Dietcoin}: hardening {Bitcoin} transaction verification process for mobile devices", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1946--1949", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352106", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed ledgers are among the most replicated data repositories in the world. They offer data consistency, immutability, and auditability, based on the assumption that each participating node locally verifies their entire content. Although their content, currently extending up to a few hundred gigabytes, can be accommodated by dedicated commodity hard disks, downloading it, processing it, and storing it in general-purpose desktop and laptop computers can prove largely impractical. Even worse, this becomes a prohibitive restriction for smartphones, mobile devices, and resource-constrained IoT devices. In this demo, we present an implementation of Dietcoin, a Bitcoin protocol extension that allows nodes to perform secure local verification of Bitcoin transactions with small bandwidth and storage requirements. 
This demo presents and benchmarks the main features of Dietcoin that are important for today's cryptocurrencies and smart contract systems, but are missing in the current state-of-the-art: (i) allowing resource-constrained devices to verify the correctness of selected blocks locally without having to download the complete ledger; (ii) enabling devices to join a blockchain quickly yet securely, dropping bootstrap time from days down to a matter of seconds; (iii) providing a generic solution that can be applied to other distributed ledgers secured with Proof-of-Work.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Singla:2019:RLS, author = "Samriddhi Singla and Ahmed Eldawy and Rami Alghamdi and Mohamed F. Mokbel", title = "{Raptor}: large scale analysis of big raster and vector data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1950--1953", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352107", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the increase in amount of remote sensing data, there have been efforts to efficiently process it to help ecologists and geographers answer queries. However, they often need to process this data in combination with vector data, for example, city boundaries. Existing efforts require one dataset to be converted to the other representation, which is extremely inefficient for large datasets. In this demonstration, we focus on the zonal statistics problem, which computes the statistics over a raster layer for each polygon in a vector layer. We demonstrate three approaches, vector-based, raster-based, and raptor-based approaches. 
The latter is a recent effort of combining raster and vector data without a need of any conversion. This demo will allow users to run their own queries in any of the three methods and observe the differences in their performance depending on different raster and vector dataset sizes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rezig:2019:DCH, author = "El Kindi Rezig and Lei Cao and Michael Stonebraker and Giovanni Simonini and Wenbo Tao and Samuel Madden and Mourad Ouzzani and Nan Tang and Ahmed K. Elmagarmid", title = "{Data Civilizer 2.0}: a holistic framework for data preparation and analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1954--1957", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352108", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data scientists spend over 80\% of their time (1) parameter-tuning machine learning models and (2) iterating between data cleaning and machine learning model execution. While there are existing efforts to support the first requirement, there is currently no integrated workflow system that couples data cleaning and machine learning development. The previous version of Data Civilizer was geared towards data cleaning and discovery using a set of pre-defined tools. In this paper, we introduce Data Civilizer 2.0, an end-to-end workflow system satisfying both requirements. In addition, this system also supports a sophisticated data debugger and a workflow visualization system. 
In this demo, we will show how we used Data Civilizer 2.0 to help scientists at the Massachusetts General Hospital build their cleaning and machine learning pipeline on their 30TB brain activity dataset.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Spiegelberg:2019:TRE, author = "Leonhard F. Spiegelberg and Tim Kraska", title = "{Tuplex}: robust, efficient analytics when {Python} rules", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1958--1961", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352109", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spark became the defacto industry standard as an execution engine for data preparation, cleaning, distributed machine learning, streaming and, warehousing over raw data. However, with the success of Python the landscape is shifting again; there is a strong demand for tools which better integrate with the Python landscape and do not have the impedance mismatch like Spark. In this paper, we demonstrate Tuplex (short for tuples and exceptions), a Python-native data preparation framework that allows users to develop and deploy pipelines faster and more robustly while providing bare-metal execution times through code compilation whenever possible.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Renggli:2019:EMC, author = "Cedric Renggli and Frances Ann Hubis and Bojan Karlas and Kevin Schawinski and Wentao Wu and Ce Zhang", title = "{Ease.ml\slash ci} and {Ease.ml\slash meter} in action: towards data management for statistical generalization", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1962--1965", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352110", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Developing machine learning (ML) applications is similar to developing traditional software --- it is often an iterative process in which developers navigate within a rich space of requirements, design decisions, implementations, empirical quality, and performance. In traditional software development, software engineering is the field of study which provides principled guidelines for this iterative process. However, as of today, the counterpart of ``software engineering for ML'' is largely missing --- developers of ML applications are left with powerful tools (e.g., TensorFlow and PyTorch) but little guidance regarding the development lifecycle itself. In this paper, we view the management of ML development life-cycles from a data management perspective. We demonstrate two closely related systems, ease.ml/ci and ease.ml/meter, that provide some ``principled guidelines'' for ML application development: ci is a continuous integration engine for ML models and meter is a ``profiler'' for controlling overfitting of ML models. Both systems focus on managing the ``statistical generalization power'' of datasets used for assessing the quality of ML applications, namely, the validation set and the test set. 
By demonstrating these two systems we hope to spawn further discussions within our community on building this new type of data management systems for statistical generalization.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Han:2019:PRV, author = "Xueran Han and Jun Chen and Jiaheng Lu and Yueguo Chen and Xiaoyong Du", title = "{PivotE}: revealing and visualizing the underlying entity structures for exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1966--1969", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352111", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A Web-scale knowledge graph (KG) typically contains millions of entities and thousands of entity types. Due to the lack of a pre-defined data schema such as the ER model, entities in KGs are loosely coupled based on their relationships, which brings challenges for effective accesses of the KGs in a structured manner like SPARQL. This demonstration presents an entity-oriented exploratory search prototype system that is able to support search and explore KGs in a exploratory search manner, where local structures of KGs can be dynamically discovered and utilized for guiding users. The system applies a path-based ranking method for recommending similar entities and their relevant information as exploration pointers. The interface is designed to assist users to investigate a domain (particular type) of entities, as well as to explore the knowledge graphs in various relevant domains. The queries are dynamically formulated by tracing the users' dynamic clicking (exploration) behaviors. 
In this demonstration, we will show how our system visualize the underlying entity structures, as well as explain the semantic correlations among them in a unified interface, which not only assist users to learn about the properties of entities in many aspects but also guide them to further explore the information space.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2019:SYA, author = "Jiaheng Lu and Yuxing Chen and Herodotos Herodotou and Shivnath Babu", title = "Speedup your analytics: automatic parameter tuning for databases and big data systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1970--1973", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352112", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database and big data analytics systems such as Hadoop and Spark have a large number of configuration parameters that control memory distribution, I/O optimization, parallelism, and compression. Improper parameter settings can cause significant performance degradation and stability issues. However, regular users and even expert administrators struggle to understand and tune them to achieve good performance. In this tutorial, we review existing approaches on automatic parameter tuning for databases, Hadoop, and Spark, which we classify into six categories: rule-based, cost modeling, simulation-based, experiment-driven, machine learning, and adaptive tuning. We describe the foundations of different automatic parameter tuning algorithms and present pros and cons of each approach. 
We also highlight real-world applications and systems, and identify research challenges for handling cloud services, resource heterogeneity, and real-time analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Meng:2019:TAC, author = "Yu Meng and Jiaxin Huang and Jingbo Shang and Jiawei Han", title = "{TextCube}: automated construction and multidimensional exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1974--1977", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352113", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's society is immersed in a wealth of text data, ranging from news articles, to social media, research literature, medical records, and corporate reports. A grand challenge of data science and engineering is to develop effective and scalable methods to extract structures and knowledge from massive text data to satisfy diverse applications, without extensive, corpus-specific human annotations. In this tutorial, we show that TextCube provides a critical information organization structure that will satisfy such an information need. We overview a set of recently developed data-driven methods that facilitate automated construction of TextCubes from massive, domain-specific text corpora, and show that TextCubes so constructed will enhance text exploration and analysis for various applications. We focus on new TextCube construction methods that are scalable, weakly-supervised, domain-independent, language-agnostic, and effective (i.e., generating quality TextCubes from large corpora of various domains). 
We will demonstrate with real datasets (including news articles, scientific publications, and product reviews) on how TextCubes can be constructed to assist multidimensional analysis of massive text corpora.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Amer-Yahia:2019:EEO, author = "Sihem Amer-Yahia and Senjuti Basu Roy", title = "The ever evolving online labor market: overview, challenges and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1978--1981", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352114", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The goal of this tutorial is to make the audience aware of various discipline-specific research activities that could be characterized to be part of online labor markets and advocate for a unified framework that is interdisciplinary in nature and requires convergence of different research disciplines. We will discuss how such a framework could bring transformative effect on the nexus of humans, technology, and the future of work.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sabek:2019:MLM, author = "Ibrahim Sabek and Mohamed F. 
Mokbel", title = "Machine learning meets big spatial data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1982--1985", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352115", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The proliferation in amounts of generated data has propelled the rise of scalable machine learning solutions to efficiently analyze and extract useful insights from such data. Meanwhile, spatial data has become ubiquitous, e.g., GPS data, with increasingly sheer sizes in recent years. The applications of big spatial data span a wide spectrum of interests including tracking infectious disease, climate change simulation, drug addiction, among others. Consequently, major research efforts are exerted to support efficient analysis and intelligence inside these applications by either providing spatial extensions to existing machine learning solutions or building new solutions from scratch. In this 90-minutes tutorial, we comprehensively review the state-of-the-art work in the intersection of machine learning and big spatial data. We cover existing research efforts and challenges in three major areas of machine learning, namely, data analysis, deep learning and statistical inference, as well as two advanced spatial machine learning tasks, namely, spatial features extraction and spatial sampling. We also highlight open problems and challenges for future research in this area.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Nargesian:2019:DLM, author = "Fatemeh Nargesian and Erkang Zhu and Ren{\'e}e J. Miller and Ken Q. Pu and Patricia C. 
Arocena", title = "Data lake management: challenges and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1986--1989", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352116", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ubiquity of data lakes has created fascinating new challenges for data management research. In this tutorial, we review the state-of-the-art in data management for data lakes. We consider how data lakes are introducing new problems including dataset discovery and how they are changing the requirements for classic problems including data extraction, data cleaning, data integration, data versioning, and metadata management.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lakshmanan:2019:CFN, author = "Laks V. S. Lakshmanan and Michael Simpson and Saravanan Thirumuruganathan", title = "Combating fake news: a data management and mining perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1990--1993", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352117", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fake news is a major threat to global democracy resulting in diminished trust in government, journalism and civil society. The public popularity of social media and social networks has caused a contagion of fake news where conspiracy theories, disinformation and extreme views flourish. Detection and mitigation of fake news is one of the fundamental problems of our times and has attracted widespread attention. 
While fact checking websites such as snopes, politifact and major companies such as Google, Facebook, and Twitter have taken preliminary steps towards addressing fake news, much more remains to be done. As an interdisciplinary topic, various facets of fake news have been studied by communities as diverse as machine learning, databases, journalism, political science and many more. The objective of this tutorial is two-fold. First, we wish to familiarize the database community with the efforts by other communities on combating fake news. We provide a panoramic view of the state-of-the-art of research on various aspects including detection, propagation, mitigation, and intervention of fake news. Next, we provide a concise and intuitive summary of prior research by the database community and discuss how it could be used to counteract fake news. The tutorial covers research from areas such as data integration, truth discovery and fusion, probabilistic databases, knowledge graphs and crowdsourcing from the lens of fake news. Effective tools for addressing fake news could only be built by leveraging the synergistic relationship between database and other research communities. We hope that our tutorial provides an impetus towards such synthesis of ideas and the creation of new ones.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Anciaux:2019:PDS, author = "Nicolas Anciaux and Luc Bouganim and Philippe Pucheral and Iulian Sandu Popa and Guillaume Scerri", title = "Personal database security and trusted execution environments: a tutorial at the crossroads", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1994--1997", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352118", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Smart disclosure initiatives and new regulations such as GDPR in the EU increase the interest for Personal Data Management Systems (PDMS) being provided to individuals to preserve their entire digital life. Consequently, the thorny issue of data security becomes more and more prominent, but highly differs from traditional privacy issues in outsourced corporate databases. Concurrently, the emergence of Trusted Execution Environments (TEE) changes the game in privacy-preserving data management with novel security models. This tutorial offers a global perspective of the current state of work at the confluence of these two rapidly growing areas. The goal is threefold: (1) review and categorize PDMS solutions and identify existing privacy threats and countermeasures; (2) review new security models capitalizing on TEEs and related privacy-preserving data management solutions relevant to the personal context; (3) discuss new challenges at the intersection of PDMS security and TEE-based data management.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kessler:2019:SHG, author = "Stephan Kessler and Jens Hoff and Johann-Christoph Freytag", title = "{SAP HANA} goes private: from privacy research to privacy aware enterprise analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "1998--2009", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352119", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the last 20 years, the progress of information technology has allowed many companies to generate, integrate, store, and analyze data of unprecedented size and complexity. In many cases, this data is personal data and how it can be used is therefore subject to laws that depend on the specific countries and application domains. For example, the General Data Protection Regulation (GDPR) introduced in the European Union imposes strict rules on how personal data can be processed. Analyzing personal data can create tremendous value, but at the same time companies must ensure that they remain legally compliant. Unfortunately, existing systems offer only limited or no support at all for processing personal data in a privacy-aware manner. Approaches that have emerged from the academic and industrial research environments need to be integrated into large systems (like enterprise systems) in a manageable and scalable way. In many IT environments, it is also desirable and necessary to combine and to integrate personal data with other (non-personal) data in a seamless fashion. In this paper, we present the first steps that SAP has taken to provide its database management system SAP HANA with privacy-enhanced processing capabilities, referred to in the following as SAP HANA Data Anonymization. 
Various goals on both the conceptual and technical levels were followed with the aim of providing SAP customers today with an integrated processing environment for personal and non-personal data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Damasio:2019:GAL, author = "Guilherme Damasio and Vincent Corvinelli and Parke Godfrey and Piotr Mierzejewski and Alex Mihaylov and Jaroslaw Szlichta and Calisto Zuzarte", title = "Guided automated learning for query workload re-optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2010--2021", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352120", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimization is a hallmark of database systems. When an SQL query runs more expensively than is viable or warranted, determination of the performance issues is usually performed manually in consultation with experts through the analysis of query's execution plan (QEP). However, this is an excessively time consuming, human error-prone, and costly process. GALO is a novel system that automates this process. The tool automatically learns recurring problem patterns in query plans over workloads in an offline learning phase, to build a knowledge base of plan-rewrite remedies. It then uses the knowledge base online to re-optimize queries often quite drastically. GALO's knowledge base is built on RDF and SPARQL, W3C graph database standards, which is well suited for manipulating and querying over SQL query plans, which are graphs themselves. GALO acts as a third-tier of re-optimization, after query rewrite and cost-based optimization, as a query plan rewrite. 
For generality, the context of knowledge base problem patterns, including table and column names, is abstracted with canonical symbol labels. Since the knowledge base is not tied to the context of supplied QEPs, table and column names are matched automatically during the re-optimization phase. Thus, problem patterns learned over a particular query workload can be applied in other query workloads. GALO's knowledge base is also an invaluable tool for database experts to debug query performance issues by tracking to known issues and solutions as well as refining the optimizer with new tuned techniques by the development team. We demonstrate an experimental study of the effectiveness of our techniques over synthetic TPC-DS and real IBM client query workloads.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chattopadhyay:2019:PUS, author = "Biswapesh Chattopadhyay and Priyam Dutta and Weiran Liu and Ott Tinn and Andrew Mccormick and Aniket Mokashi and Paul Harvey and Hector Gonzalez and David Lomax and Sagar Mittal and Roee Ebenstein and Nikita Mikhaylin and Hung-ching Lee and Xiaoyan Zhao and Tony Xu and Luis Perez and Farhad Shahmohammadi and Tran Bui and Neil McKay and Selcuk Aya and Vera Lychagina and Brett Elliott", title = "{Procella}: unifying serving and analytical data at {YouTube}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2022--2034", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352121", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large organizations like YouTube are dealing with exploding data volume and increasing demand for data driven applications. 
Broadly, these can be categorized as: reporting and dashboarding, embedded statistics in pages, time-series monitoring, and ad-hoc analysis. Typically, organizations build specialized infrastructure for each of these use cases. This, however, creates silos of data and processing, and results in a complex, expensive, and harder to maintain infrastructure. At YouTube, we solved this problem by building a new SQL query engine --- Procella. Procella implements a superset of capabilities required to address all of the four use cases above, with high scale and performance, in a single product. Today, Procella serves hundreds of billions of queries per day across all four workloads at YouTube and several other Google product areas.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lu:2019:LET, author = "Wei Lu and Zhanhao Zhao and Xiaoyu Wang and Haixiang Li and Zhenmiao Zhang and Zhiyu Shui and Sheng Ye and Anqun Pan and Xiaoyong Du", title = "A lightweight and efficient temporal database management system in {TDSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2035--2046", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352122", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Driven by the recent adoption of temporal expressions into SQL:2011, extensions of temporal support in conventional database management systems (a.b.a. DBMSs) have re-emerged as a research hotspot. In this paper, we present a lightweight yet efficient built-in temporal implementation in Tencent's distributed database management system, namely TDSQL. 
The novelty of TDSQL's temporal implementation includes: (1) a new temporal data model with the extension of SQL:2011, (2) a built-in temporal implementation with various optimizations, which are also applicable to other DBMSs, and (3) a low-storage-consumption in which only data changes are maintained. For the repeatability purpose, we elaborate the integration of our proposed techniques into MySQL. We conduct extensive experiments on both real-life dataset and synthetic TPC benchmarks by comparing TDSQL with other temporal databases. The results show that TDSQL is lightweight and efficient.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sherkat:2019:NSE, author = "Reza Sherkat and Colin Florendo and Mihnea Andrei and Rolando Blanco and Adrian Dragusanu and Amit Pathak and Pushkar Khadilkar and Neeraj Kulkarni and Christian Lemke and Sebastian Seifert and Sarika Iyer and Sasikanth Gottapu and Robert Schulze and Chaitanya Gottipati and Nirvik Basak and Yanhong Wang and Vivek Kandiyanallur and Santosh Pendap and Dheren Gala and Rajesh Almeida and Prasanta Ghosh", title = "Native store extension for {SAP HANA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2047--2058", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352123", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present an overview of SAP HANA's Native Store Extension (NSE). This extension substantially increases database capacity, allowing to scale far beyond available system memory. NSE is based on a hybrid in-memory and paged column store architecture composed from data access primitives.
These primitives enable the processing of hybrid columns using the same algorithms optimized for traditional HANA's in-memory columns. Using only three key primitives, we fabricated byte-compatible counterparts for complex memory resident data structures (e.g. dictionary and hash-index), compressed schemes (e.g. sparse and run-length encoding), and exotic data types (e.g. geo-spatial). We developed a new buffer cache which optimizes the management of paged resources by smart strategies sensitive to page type and access patterns. The buffer cache integrates with HANA's new execution engine that issues pipelined prefetch requests to improve disk access patterns. A novel load unit configuration, along with a unified persistence format, allows the hybrid column store to dynamically switch between in-memory and paged data access to balance performance and storage economy according to application demands while reducing Total Cost of Ownership (TCO). A new partitioning scheme supports load unit specification at table, partition, and column level. Finally, a new advisor recommends optimal load unit configurations. Our experiments illustrate the performance and memory footprint improvements on typical customer scenarios.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhan:2019:ART, author = "Chaoqun Zhan and Maomeng Su and Chuangxian Wei and Xiaoqiang Peng and Liang Lin and Sheng Wang and Zhe Chen and Feifei Li and Yue Pan and Fang Zheng and Chengliang Chai", title = "{AnalyticDB}: real-time {OLAP} database system at {Alibaba} cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2059--2070", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352124", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With data explosion in scale and variety, OLAP databases play an increasingly important role in serving real-time analysis with low latency (e.g., hundreds of milliseconds), especially when incoming queries are complex and ad hoc in nature. Moreover, these systems are expected to provide high query concurrency and write throughput, and support queries over structured and complex data types (e.g., JSON, vector and texts). In this paper, we introduce AnalyticDB, a real-time OLAP database system developed at Alibaba. AnalyticDB maintains all-column indexes in an asynchronous manner with acceptable overhead, which provides low latency for complex ad-hoc queries. Its storage engine extends hybrid row-column layout for fast retrieval of both structured data and data of complex types. To handle large-scale data with high query concurrency and write throughput, AnalyticDB decouples read and write access paths. To further reduce query latency, novel storage-aware SQL optimizer and execution engine are developed to fully utilize the advantages of the underlying storage and indexes. AnalyticDB has been successfully deployed on Alibaba Cloud to serve numerous customers (both large and small). 
It is capable of holding 100 trillion rows of records, i.e., 10PB+ in size. At the same time, it is able to serve 10m+ writes and 100k+ queries per second, while completing complex queries within hundreds of milliseconds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Schultz:2019:TCM, author = "William Schultz and Tess Avitabile and Alyson Cabral", title = "Tunable consistency in {MongoDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2071--2081", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352125", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed databases offer high availability but can impose high costs on client applications in order to maintain strong consistency at all times. MongoDB is a document oriented database whose replication system provides a variety of consistency levels allowing client applications to select the trade-offs they want to make when it comes to consistency and latency, at a per operation level. In this paper we discuss the tunable consistency models in MongoDB replication and their utility for application developers. We discuss how the MongoDB replication system's speculative execution model and data rollback protocol help make this spectrum of consistency levels possible. We also present case studies of how these consistency levels are used in real world applications, along with a characterization of their performance benefits and trade-offs.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2019:TOR, author = "Shaosheng Cao and XinXing Yang and Cen Chen and Jun Zhou and Xiaolong Li and Yuan Qi", title = "{TitAnt}: online real-time transaction fraud detection in {Ant Financial}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2082--2093", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352126", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the explosive growth of e-commerce and the booming of e-payment, detecting online transaction fraud in real time has become increasingly important to Fintech business. To tackle this problem, we introduce the TitAnt, a transaction fraud detection system deployed in Ant Financial, one of the largest Fintech companies in the world. The system is able to predict online real-time transaction fraud in mere milliseconds. We present the problem definition, feature extraction, detection methods, implementation and deployment of the system, as well as empirical effectiveness. Extensive experiments have been conducted on large real-world transaction data to show the effectiveness and the efficiency of the proposed system.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2019:ACG, author = "Rong Zhu and Kun Zhao and Hongxia Yang and Wei Lin and Chang Zhou and Baole Ai and Yong Li and Jingren Zhou", title = "{AliGraph}: a comprehensive graph neural network platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2094--2105", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352127", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An increasing number of machine learning tasks require dealing with large graph datasets, which capture rich and complex relationship among potentially billions of elements. Graph Neural Network (GNN) becomes an effective way to address the graph learning problem by converting the graph data into a low dimensional space while keeping both the structural and property information to the maximum extent and constructing a neural network for training and referencing. However, it is challenging to provide an efficient graph storage and computation capabilities to facilitate GNN training and enable development of new GNN algorithms. In this paper, we present a comprehensive graph neural network system, namely AliGraph, which consists of distributed graph storage, optimized sampling operators and runtime to efficiently support not only existing popular GNNs but also a series of in-house developed ones for different scenarios. The system is currently deployed at Alibaba to support a variety of business scenarios, including product recommendation and personalized search at Alibaba's E-Commerce platform. 
By conducting extensive experiments on a real-world dataset with 492.90 million vertices, 6.82 billion edges and rich attributes, AliGraph performs an order of magnitude faster in terms of graph building (5 minutes vs hours reported from the state-of-the-art PowerGraph platform). At training, AliGraph runs 40\%--50\% faster with the novel caching strategy and demonstrates around 12 times speed up with the improved runtime. In addition, our in-house developed GNN models all showcase their statistically significant superiorities in terms of both effectiveness and efficiency (e.g., 4.12\%--17.19\% lift by F1 scores).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chen:2019:CSF, author = "Zhimin Chen and Yue Wang and Vivek Narasayya and Surajit Chaudhuri", title = "Customizable and scalable fuzzy join for big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2106--2117", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352128", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fuzzy join is an important primitive for data cleaning. The ability to customize fuzzy join is crucial to allow applications to address domain-specific data quality issues such as synonyms and abbreviations. While efficient indexing techniques exist for single-node implementations of customizable fuzzy join, the state-of-the-art scale-out techniques do not support customization, and exhibit poor performance and scalability characteristics. We describe the design of a scale-out fuzzy join operator that supports customization. We use a locality-sensitive-hashing (LSH) based signature scheme, and introduce optimizations that result in significant speed up with negligible impact on recall.
We evaluate our implementation on the Azure Databricks version of Spark using several real-world and synthetic data sets. We observe speedups exceeding 50X compared to the best-known prior scale-out technique, and close to linear scalability with data size and number of nodes.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2019:QQA, author = "Guoliang Li and Xuanhe Zhou and Shifu Li and Bo Gao", title = "{QTune}: a query-aware database tuning system with deep reinforcement learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2118--2130", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352129", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database knob tuning is important to achieve high performance (e.g., high throughput and low latency). However, knob tuning is an NP-hard problem and existing methods have several limitations. First, DBAs cannot tune a lot of database instances on different environments (e.g., different database vendors). Second, traditional machine-learning methods either cannot find good configurations or rely on a lot of high-quality training examples which are rather hard to obtain. Third, they only support coarse-grained tuning (e.g., workload-level tuning) but cannot provide fine-grained tuning (e.g., query-level tuning). To address these problems, we propose a query-aware database tuning system QTune with a deep reinforcement learning (DRL) model, which can efficiently and effectively tune the database configurations. QTune first featurizes the SQL queries by considering rich features of the SQL queries. Then QTune feeds the query features into the DRL model to choose suitable configurations. 
We propose a Double-State Deep Deterministic Policy Gradient (DS-DDPG) model to enable query-aware database configuration tuning, which utilizes the actor-critic networks to tune the database configurations based on both the query vector and database states. QTune provides three database tuning granularities: query-level, workload-level, and cluster-level tuning. We deployed our techniques onto three real database systems, and experimental results show that QTune achieves high performance and outperforms the state-of-the-art tuning methods.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kandula:2019:EAQ, author = "Srikanth Kandula and Kukjin Lee and Surajit Chaudhuri and Marc Friedman", title = "Experiences with approximating queries in {Microsoft}'s production big-data clusters", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2131--2142", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352130", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the rapidly growing volume of data, it is more attractive than ever to leverage approximations to answer analytic queries. Sampling is a powerful technique which has been studied extensively from the point of view of facilitating approximation. Yet, there has been no large-scale study of effectiveness of sampling techniques in big data systems. In this paper, we describe an in-depth study of the sampling-based approximation techniques that we have deployed in Microsoft's big data clusters. We explain the choices we made to implement approximation, identify the usage cases, and study detailed data that sheds insight on the usefulness of doing sampling based approximation.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Antonopoulos:2019:CTR, author = "Panagiotis Antonopoulos and Peter Byrne and Wayne Chen and Cristian Diaconu and Raghavendra Thallam Kodandaramaih and Hanuma Kodavalla and Prashanth Purnananda and Adrian-Leonard Radu and Chaitanya Sreenivas Ravella and Girish Mittur Venkataramanappa", title = "Constant time recovery in {Azure SQL} database", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2143--2154", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352131", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Azure SQL Database and the upcoming release of SQL Server introduce a novel database recovery mechanism that combines traditional ARIES recovery with multi-version concurrency control to achieve database recovery in constant time, regardless of the size of user transactions. Additionally, our algorithm enables continuous transaction log truncation, even in the presence of long running transactions, thereby allowing large data modifications using only a small, constant amount of log space. These capabilities are particularly important for any Cloud database service given (a) the constantly increasing database sizes, (b) the frequent failures of commodity hardware, (c) the strict availability requirements of modern, global applications and (d) the fact that software upgrades and other maintenance tasks are managed by the Cloud platform, introducing unexpected failures for the users. 
This paper describes the design of our recovery algorithm and demonstrates how it allowed us to improve the availability of Azure SQL Database by guaranteeing consistent recovery times of under 3 minutes for 99.999\% of recovery cases in production.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2019:YGD, author = "Yuzhen Huang and Yingjie Shi and Zheng Zhong and Yihui Feng and James Cheng and Jiwei Li and Haochuan Fan and Chao Li and Tao Guan and Jingren Zhou", title = "{Yugong}: geo-distributed data and job placement at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2155--2169", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352132", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Companies like Alibaba operate tens of data centers (DCs) across geographically distributed locations. These DCs collectively provide the storage space and computing power for the company, storing EBs of data and serving millions of batch analytics jobs every day. In Alibaba, as our businesses grow, there are more and more cross-DC dependencies caused by jobs reading data from remote DCs. Consequently, the precious wide area network bandwidth becomes a major bottleneck for operating geo-distributed DCs at scale. In this paper, we present Yugong --- a system that manages data placement and job placement in Alibaba's geo-distributed DCs, with the objective to minimize cross-DC bandwidth usage. Yugong uses three methods, namely project placement, table replication, and job outsourcing, to address the issues of high bandwidth consumption across the DCs. 
We give the details of Yugong's design and implementation for the three methods, and describe how it cooperates with other systems (e.g., Alibaba's big data analytics platform and cluster scheduler) to improve the productivity of the DCs. We also report comprehensive performance evaluation results, which validate the design of Yugong and show that significant reduction in cross-DC bandwidth usage has been achieved.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tan:2019:CCD, author = "Junjay Tan and Thanaa Ghanem and Matthew Perron and Xiangyao Yu and Michael Stonebraker and David DeWitt and Marco Serafini and Ashraf Aboulnaga and Tim Kraska", title = "Choosing a cloud {DBMS}: architectures and tradeoffs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2170--2182", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352133", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As analytic (OLAP) applications move to the cloud, DBMSs have shifted from employing a pure shared-nothing design with locally attached storage to a hybrid design that combines the use of shared-storage (e.g., AWS S3) with the use of shared-nothing query execution mechanisms. This paper sheds light on the resulting tradeoffs, which have not been properly identified in previous work. To this end, it evaluates the TPC-H benchmark across a variety of DBMS offerings running in a cloud environment (AWS) on fast 10Gb+ networks, specifically database-as-a-service offerings (Redshift, Athena), query engines (Presto, Hive), and a traditional cloud agnostic OLAP database (Vertica). 
While these comparisons cannot be apples-to-apples in all cases due to cloud configuration restrictions, we nonetheless identify patterns and design choices that are advantageous. These include prioritizing low-cost object stores like S3 for data storage, using system agnostic yet still performant columnar formats like ORC that allow easy switching to other systems for different workloads, and making features that benefit subsequent runs like query precompilation and caching remote data to faster storage optional rather than required because they disadvantage ad hoc queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2019:SSM, author = "Jingtian Zhang and Sai Wu and Zeyuan Tan and Gang Chen and Zhushi Cheng and Wei Cao and Yusong Gao and Xiaojie Feng", title = "{S3}: a scalable in-memory skip-list index for key--value store", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2183--2194", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352134", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many new memory indexing structures have been proposed and outperform current in-memory skip-list index adopted by LevelDB, RocksDB and other key--value systems. However, those new indexes cannot be easily integrated with key--value systems, because most of them do not consider how the data can be efficiently flushed to disk. Some assumptions, such as fixed size key and value, are unrealistic for real applications. In this paper, we present S3, a scalable in-memory skip-list index for the customized version of RocksDB in Alibaba Cloud. S3 adopts a two-layer structure. 
In the top layer, a cache-sensitive structure is used to maintain a few guard entries to facilitate the search over the skip-list. In the bottom layer, a semi-ordered skip-list index is built to support highly concurrent insertions and fast lookup and range query. To further improve the performance, we train a neural model to select guard entries intelligently according to the data distribution and query distribution. Experiments on multiple datasets show that S3 achieves a comparable performance to other new memory indexing schemes, and can replace current in-memory skip-list of LevelDB and RocksDB to support huge volume of data.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Masson:2019:DFF, author = "Charles Masson and Jee E. Rim and Homin K. Lee", title = "{DDSketch}: a fast and fully-mergeable quantile sketch with relative-error guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2195--2205", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352135", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Summary statistics such as the mean and variance are easily maintained for large, distributed data streams, but order statistics (i.e., sample quantiles) can only be approximately summarized. There is extensive literature on maintaining quantile sketches where the emphasis has been on bounding the rank error of the sketch while using little memory. Unfortunately, rank error guarantees do not preclude arbitrarily large relative errors, and this often occurs in practice when the data is heavily skewed. 
Given the distributed nature of contemporary large-scale systems, another crucial property for quantile sketches is mergeability, i.e., several combined sketches must be as accurate as a single sketch of the same data. We present the first fully-mergeable, relative-error quantile sketching algorithm with formal guarantees. The sketch is extremely fast and accurate, and is currently being used by Datadog at a wide-scale.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Long:2019:DSL, author = "Qiang Long and Wei Wang and Jinfu Deng and Song Liu and Wenhao Huang and Fangying Chen and Sifan Liu", title = "A distributed system for large-scale $n$-gram language models at {Tencent}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2206--2217", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352136", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "n-gram language models are widely used in language processing applications, e.g., automatic speech recognition, for ranking the candidate word sequences generated from the generator model, e.g., the acoustic model. Large n-gram models typically give good ranking results; however, they require a huge amount of memory storage. While distributing the model across multiple nodes resolves the memory issue, it nonetheless incurs a great network communication overhead and introduces a different bottleneck. In this paper, we present our distributed system developed at Tencent with novel optimization techniques for reducing the network overhead, including distributed indexing, batching and caching. They reduce the network requests and accelerate the operation on each single node.
We also propose a cascade fault-tolerance mechanism which adaptively switches to small n-gram models depending on the severity of the failure. Experimental study on 9 automatic speech recognition (ASR) datasets confirms that our distributed system scales to large models efficiently, effectively and robustly. We have successfully deployed it for Tencent's WeChat ASR with the peak network traffic at the scale of 100 millions of messages per minute.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Dursun:2019:MDQ, author = "Kayhan Dursun and Carsten Binnig and Ugur Cetintemel and Garret Swart and Weiwei Gong", title = "A morsel-driven query execution engine for heterogeneous multi-cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2218--2229", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352137", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Currently, we face the next major shift in processor designs that arose from the physical limitations known as the ``dark silicon effect''. Due to thermal limitations and shrinking transistor sizes, multi-core scaling is coming to an end. A major new direction that hardware vendors are currently investigating involves specialized and energy-efficient hardware accelerators (e.g., ASICs) placed on the same die as the normal CPU cores. In this paper, we present a novel query processing engine called SiliconDB that targets such heterogeneous processor environments. We leverage the Sparc M7 platform to develop and test our ideas. 
Based on the SSB benchmarks, as well as other micro benchmarks, we compare the efficiency of SiliconDB with existing execution strategies that make use of co-processors (e.g., FPGAs, GPUs) and demonstrate speed-up improvements of up to 2x.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cao:2019:SSS, author = "Lei Cao and Wenbo Tao and Sungtae An and Jing Jin and Yizhou Yan and Xiaoyu Liu and Wendong Ge and Adam Sah and Leilani Battle and Jimeng Sun and Remco Chang and Brandon Westover and Samuel Madden and Michael Stonebraker", title = "{Smile}: a system to support machine learning on {EEG} data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2230--2241", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352138", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In order to reduce the possibility of neural injury from seizures and sidestep the need for a neurologist to spend hours on manually reviewing the EEG recording, it is critical to automatically detect and classify ``interictal-ictal continuum'' (IIC) patterns from EEG data. However, the existing IIC classification techniques are shown to be not accurate and robust enough for clinical use because of the lack of high quality labels of EEG segments as training data. Obtaining high-quality labeled data is traditionally a manual process by trained clinicians that can be tedious, time-consuming, and error-prone. In this work, we propose Smile, an industrial scale system that provides an end-to-end solution to the IIC pattern classification problem. The core components of Smile include a visualization-based time series labeling module and a deep-learning based active learning module. 
The labeling module enables the users to explore and label 350 million EEG segments (30TB) at interactive speed. The multiple coordinated views allow the users to examine the EEG signals from both time domain and frequency domain simultaneously. The active learning module first trains a deep neural network that automatically extracts both the local features with respect to each segment itself and the long term dynamics of the EEG signals to classify IIC patterns. Then leveraging the output of the deep learning model, the EEG segments that can best improve the model are selected and prompted to clinicians to label. This process is iterated until the clinicians and the models show high degree of agreement. Our initial experimental results show that our Smile system allows the clinicians to label the EEG segments at will with a response time below 500 ms. The accuracy of the model is progressively improved as more and more high quality labels are acquired over time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Green:2019:UGD, author = "Alastair Green and Paolo Guagliardo and Leonid Libkin and Tobias Lindaaker and Victor Marsault and Stefan Plantikow and Martin Schuster and Petra Selmer and Hannes Voigt", title = "Updating graph databases with {Cypher}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2242--2254", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352139", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The paper describes the present and the future of graph updates in Cypher, the language of the Neo4j property graph database and several other products. 
Update features include those with clear analogs in relational databases, as well as those that do not correspond to any relational operators. Moreover, unlike SQL, Cypher updates can be arbitrarily intertwined with querying clauses. After presenting the current state of update features, we point out their shortcomings, most notably violations of atomicity and non-deterministic behavior of updates. These have not been previously known in the Cypher community. We then describe the industry-academia collaboration on designing a revised set of Cypher update operations. Based on discovered shortcomings of update features, a number of possible solutions were devised. They were presented to key Cypher users, who were given the opportunity to comment on how update features are used in real life, and on their preferences for proposed fixes. As the result of the consultation, a new set of update operations for Cypher were designed. Those led to a streamlined syntax, and eliminated the unexpected and problematic behavior that original Cypher updates exhibited.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kamsky:2019:ATC, author = "Asya Kamsky", title = "Adapting {TPC-C} benchmark to measure performance of multi-document transactions in {MongoDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2254--2262", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352140", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MongoDB is a popular distributed database that supports replication, horizontal partitioning (sharding), a flexible document schema and ACID guarantees on the document level. 
While it is generally grouped with ``NoSQL'' databases, MongoDB provides many features similar to those of traditional RDBMS such as secondary indexes, an ad hoc query language, support for complex aggregations, and new as of version 4.0 multi-statement, multi-document ACID transactions. We looked for a well understood OLTP workload benchmark to use in our own system performance test suite to establish a baseline of transaction performance to enable flagging performance regressions, as well as improvements as we continue to add new functionality. While there exist many published and widely used benchmarks for RDBMS OLTP workloads, there are none specifically for document databases. This paper describes the process of adapting an existing traditional RDBMS benchmark to MongoDB query language and transaction semantics to allow measuring transaction performance. We chose to adapt the TPC-C benchmark even though it assumes a relational database schema and SQL, hence extensive changes had to be made to stay consistent with MongoDB best practices. Our goal did not include creating official TPC-C certifiable results, however, every attempt was made to stay consistent with the spirit of the original benchmark specification as well as to be compliant to all specification requirements where possible. We discovered that following best practices for document schema design achieves better performance than using required normalized schema. All the source code used and validation scripts are published in github to allow the reader to recreate and verify our results.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2019:CND, author = "Feifei Li", title = "Cloud-native database systems at {Alibaba}: opportunities and challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2263--2272", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud-native databases become increasingly important for the era of cloud computing, due to the needs for elasticity and on-demand usage by various applications. These challenges from cloud applications present new opportunities for cloud-native databases that cannot be fully addressed by traditional on-premise enterprise database systems. A cloud-native database leverages software-hardware co-design to explore accelerations offered by new hardware such as RDMA, NVM, kernel bypassing protocols such as DPDK. Meanwhile, new design architectures, such as shared storage, enable a cloud-native database to decouple computation from storage and provide excellent elasticity. For highly concurrent workloads that require horizontal scalability, a cloud-native database can leverage a shared-nothing layer to provide distributed query and transaction processing. Applications also require cloud-native databases to offer high availability through distributed consensus protocols. At Alibaba, we have explored a suite of technologies to design cloud-native database systems. Our storage engine, X-Engine and PolarFS, improves both write and read throughputs by using a LSM-tree design and self-adapted separation of hot and cold data records. 
Based on these efforts, we have designed and implemented POLARDB and its distributed version POLARDB-X, which has successfully supported the extreme transaction workloads during the 2018 Global Shopping Festival on November 11, 2018, and achieved commercial success on Alibaba Cloud. We have also designed an OLAP system called AnalyticDB (ADB in short) for enabling real-time interactive data analytics for big data. We have explored a self-driving database platform to achieve autoscaling and intelligent database management. We will report key technologies and lessons learned to highlight the technical challenges and opportunities for cloud-native database systems at Alibaba.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Boehm:2019:MME, author = "Alexander Boehm", title = "In-memory for the masses: enabling cost-efficient deployments of in-memory data management platforms for business applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2273--2275", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With unrivaled performance, modern in-memory data management platforms such as SAP HANA [5] enable the creation of novel types of business applications. By keeping all data in memory, applications may combine both demanding transactional as well as complex analytical workloads in the context of a single system. While this excellent performance, data freshness, and flexibility gain is highly desirable in a vast range of modern business applications [6], the corresponding large appetite for main memory has significant implications on server sizing. 
Particularly, hardware costs on premise as well as in the cloud are at risk to increase significantly, driven by the high amount of DRAM that needs to be provisioned potentially. In this talk, we discuss a variety of challenges and opportunities that arise when running business applications in a cost-efficient manner on in-memory database systems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hubail:2019:CAN, author = "Murtadha {Al Hubail} and Ali Alsuliman and Michael Blow and Michael Carey and Dmitry Lychagin and Ian Maxon and Till Westmann", title = "Couchbase analytics: {NoETL} for scalable {NoSQL} data analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2275--2286", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352143", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Couchbase Server is a highly scalable document-oriented database management system. With a shared-nothing architecture, it exposes a fast key--value store with a managed cache for sub-millisecond data operations, indexing for fast queries, and a powerful query engine for executing declarative SQL-like queries. Its Query Service debuted several years ago and supports high volumes of low-latency queries and updates for JSON documents. Its recently introduced Analytics Service complements the Query Service. Couchbase Analytics, the focus of this paper, supports complex analytical queries (e.g., ad hoc joins and aggregations) over large collections of JSON documents. This paper describes the Analytics Service from the outside in, including its user model, its SQL++ based query language, and its MPP-based storage and query processing architecture. 
It also briefly touches on the relationship of Couchbase Analytics to Apache AsterixDB, the open source Big Data management system at the core of Couchbase Analytics.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Colyer:2019:PS, author = "Adrian Colyer", title = "Performance in the spotlight", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2287--2289", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352144", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Performance in its various guises features prominently in research evaluations, and rightly so. Without adequate performance a system is not fit for purpose. That doesn't necessarily mean we should pursue performance at all costs though. In this talk we'll explore a variety of additional evaluation criteria, with a focus on those that are most important to practitioners, and ask whether or not considering them can open up interesting avenues of research.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Abouzied:2019:ILS, author = "Azza Abouzied and Daniel J. 
Abadi and Kamil Bajda-Pawlikowski and Avi Silberschatz", title = "Integration of large-scale data processing systems and traditional parallel database technology", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2290--2299", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352145", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In 2009 we explored the feasibility of building a hybrid SQL data analysis system that takes the best features from two competing technologies: large-scale data processing systems (such as Google MapReduce and Apache Hadoop) and parallel database management systems (such as Greenplum and Vertica). We built a prototype, HadoopDB, and demonstrated that it can deliver the high SQL query performance and efficiency of parallel database management systems while still providing the scalability, fault tolerance, and flexibility of large-scale data processing systems. Subsequently, HadoopDB grew into a commercial product, Hadapt, whose technology was eventually acquired by Teradata. In this paper, we provide an overview of HadoopDB's original design, and its evolution during the subsequent ten years of research and development effort. We describe how the project innovated both in the research lab, and as a commercial product at Hadapt and Teradata. We then discuss the current vibrant ecosystem of software projects (most of which are open source) that continued HadoopDB's legacy of implementing a systems level integration of large-scale data processing systems and parallel database technology.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cooper:2019:PSL, author = "Brian F. Cooper and P. P. S. 
Narayan and Raghu Ramakrishnan and Utkarsh Srivastava and Adam Silberstein and Philip Bohannon and Hans-Arno Jacobsen and Nick Puz and Daniel Weaver and Ramana Yerneni", title = "{PNUTS} to {Sherpa}: lessons from {Yahoo!}'s cloud database", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2300--2307", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352146", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we look back at the evolution of Yahoo!'s geo-replicated cloud data store from a research project called PNUTS to a globally deployed production system called Sherpa, share some of the lessons learned along the way, and finally, compare PNUTS with current operational cloud stores.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Tan:2019:WPD, author = "Wang-Chiew Tan", title = "What {I} probably did right and what {I} think {I} could have done better", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2308--2308", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352147", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "They say a lot of good things in life are not free. Success is one of them. Successful research requires an immense amount of hard work and dedication over a long period of time. For better or worse, hard work alone does not guarantee success. In my experience, success is a marathon of hard work and some luck along the way. What is often forgotten is that it is important to enjoy the journey of hard work and appreciate many experiences and relationships along the way. 
I am deeply honored to receive the 2019 VLDB Women in Database Research Award. In the talk, I will share with you a retrospective of my journey so far, what I probably did right along the way, and perhaps more importantly, the many things I think I could have done better as a computer scientist and especially a female computer scientist.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Parameswaran:2019:EDS, author = "Aditya Parameswaran", title = "Enabling data science for the majority", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2309--2322", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352148", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite great strides in the generation, collection, and processing of data at scale, data science is still extremely inconvenient for the vast majority of the population. The driving goal of our research, over the past half decade, has been to make it easy for individuals and teams---regardless of programming or analysis expertise---manage, analyze, make sense of, and draw insights from large datasets. In this article, we reflect on a comprehensive suite of tools that we've been building to empower everyone to perform data science more efficiently and effortlessly, including DataSpread, a scalable spreadsheet tool that combines the benefits of spreadsheets and databases, and ZenVisage, a visual exploration tool that accelerates the discovery of trends or patterns. Our tools have been developed in collaboration with experts in various disciplines, including neuroscience, battery science, genomics, astrophysics, and ad analytics. 
We will discuss some of the key technical challenges underlying the development of these tools, and how we addressed them, drawing from ideas in multiple disciplines. In the process, we will outline a research agenda for tool development to empower everyone to tap into the hidden potential in their datasets at scale.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Rekatsinas:2019:ODM, author = "Theodoros Rekatsinas and Sudeepa Roy and Manasi Vartak and Ce Zhang and Neoklis Polyzotis", title = "Opportunities for data management research in the era of horizontal {AI\slash ML}", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "12", pages = "2323--2323", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3352063.3352149", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:02 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "AI/ML is becoming a horizontal technology: its application is expanding to more domains, and its integration touches more parts of the technology stack. Given the strong dependence of ML on data, this expansion creates a new space for applying data management techniques. At the same time, the deeper integration of ML in the technology stack provides more touch points where ML can be used in data management systems and vice versa. In this panel, we invite researchers working in this domain to discuss this emerging world and its implications on data-management research. Among other topics, the discussion will touch on the opportunities for interesting research, how we can interact with other communities, what is the core expertise we bring to the table, and how we can conduct and evaluate this research effectively within our own community. 
The goal of the panel is to nudge the community to appreciate the opportunities in this new world of horizontal AI/ML and to spur a discussion on how we can shape an effective research agenda.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Barthels:2019:SCH, author = "Claude Barthels and Ingo M{\"u}ller and Konstantin Taranov and Gustavo Alonso and Torsten Hoefler", title = "Strong consistency is not hard to get: two-phase locking and two-phase commit on thousands of cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2325--2338", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358702", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Concurrency control is a cornerstone of distributed database engines and storage systems. In pursuit of scalability, a common assumption is that Two-Phase Locking (2PL) and Two-Phase Commit (2PC) are not viable solutions due to their communication overhead. Recent results, however, have hinted that 2PL and 2PC might not have such a bad performance. Nevertheless, there has been no attempt to actually measure how a state-of-the-art implementation of 2PL and 2PC would perform on modern hardware. The goal of this paper is to establish a baseline for concurrency control mechanisms on thousands of cores connected through a low-latency network. We develop a distributed lock table supporting all the standard locking modes used in database engines. We focus on strong consistency in the form of strict serializability implemented through strict 2PL, but also explore read-committed and repeatable-read, two common isolation levels used in many systems. 
We do not leverage any known optimizations in the locking or commit parts of the protocols. The surprising result is that, for TPC-C, 2PL and 2PC can be made to scale to thousands of cores and hundreds of machines, reaching a throughput of over 21 million transactions per second with 9.5 million New Order operations per second. Since most existing relational database engines use some form of locking for implementing concurrency control, our findings provide a path for such systems to scale without having to significantly redesign transaction management. To achieve these results, our implementation relies on Remote Direct Memory Access (RDMA). Today, this technology is commonly available on both Infiniband as well as Ethernet networks, making the results valid across a wide range of systems and platforms, including database appliances, data centers, and cloud environments.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wei:2019:DRE, author = "Ziheng Wei and Uwe Leck and Sebastian Link", title = "Discovery and ranking of embedded uniqueness constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2339--2352", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358703", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data profiling is an enabler for efficient data management and effective analytics. The discovery of data dependencies is at the core of data profiling. We conduct the first study on the discovery of embedded uniqueness constraints (eUCs). These constraints represent unique column combinations embedded in complete fragments of incomplete data. 
We showcase their implementation as filtered indexes, and their application in integrity management and query optimization. We show that the decision variant of discovering a minimal eUC is NP-complete and W[2]-complete. We characterize the maximum possible solution size, and show which families of eUCs attain that size. Despite the challenges, experiments with real-world and synthetic benchmark data show that our column(row)-efficient algorithms perform well with a large number of columns(rows), and our hybrid algorithm combines ideas from both. We show how to rank eUCs to help identify relevant eUCs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chu:2019:ODB, author = "Lingyang Chu and Yanyan Zhang and Yu Yang and Lanjun Wang and Jian Pei", title = "Online density bursting subgraph detection from temporal graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2353--2365", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358704", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a temporal weighted graph that consists of a potentially endless stream of updates, we are interested in finding density bursting subgraphs (DBS for short), where a DBS is a subgraph that accumulates its density at the fastest speed. Online DBS detection enjoys many novel applications. At the same time, it is challenging since the time duration of a DBS can be arbitrarily long but a limited size storage can buffer only up to a certain number of updates. To tackle this problem, we observe the critical decomposability of DBSs and show that a DBS with a long time duration can be decomposed into a set of indecomposable DBSs with equal or larger burstiness. 
We further prove that the time duration of an indecomposable DBS is upper bounded and propose an efficient method TopkDBSOL to detect indecomposable DBSs in an online manner. Extensive experiments demonstrate the effectiveness, efficiency and scalability of TopkDBSOL in detecting significant DBSs from temporal graphs in real applications.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Holanda:2019:PII, author = "Pedro Holanda and Mark Raasveldt and Stefan Manegold and Hannes M{\"u}hleisen", title = "Progressive indexes: indexing for interactive data analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2366--2378", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358705", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Interactive exploration of large volumes of data is increasingly common, as data scientists attempt to extract interesting information from large opaque data sets. This scenario presents a difficult challenge for traditional database systems, as (1) nothing is known about the query workload in advance, (2) the query workload is constantly changing, and (3) the system must provide interactive responses to the issued queries. This environment is challenging for index creation, as traditional database indexes require upfront creation, hence a priori workload knowledge, to be efficient. In this paper, we introduce Progressive Indexing, a novel performance-driven indexing technique that focuses on automatic index creation while providing interactive response times to incoming queries. Its design allows queries to have a limited budget to spend on index creation. The indexing budget is automatically tuned to each query before query processing. 
This allows for systems to provide interactive answers to queries during index creation while being robust against various workload patterns and data distributions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Hanai:2019:DEP, author = "Masatoshi Hanai and Toyotaro Suzumura and Wen Jun Tan and Elvis Liu and Georgios Theodoropoulos and Wentong Cai", title = "Distributed edge partitioning for trillion-edge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2379--2392", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358706", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose Distributed Neighbor Expansion (Distributed NE), a parallel and distributed graph partitioning method that can scale to trillion-edge graphs while providing high partitioning quality. Distributed NE is based on a new heuristic, called parallel expansion, where each partition is constructed in parallel by greedily expanding its edge set from a single vertex in such a way that the increase of the vertex cuts becomes local minimal. We theoretically prove that the proposed method has the upper bound in the partitioning quality. The empirical evaluation with various graphs shows that the proposed method produces higher-quality partitions than the state-of-the-art distributed graph partitioning algorithms. The performance evaluation shows that the space efficiency of the proposed method is an order-of-magnitude better than the existing algorithms, keeping its time efficiency comparable. As a result, Distributed NE can partition a trillion-edge graph using only 256 machines within 70 minutes.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Athanassoulis:2019:OCL, author = "Manos Athanassoulis and Kenneth S. B{\o}gh and Stratos Idreos", title = "Optimal column layout for hybrid workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2393--2407", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358707", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-intensive analytical applications need to support both efficient reads and writes. However, what is usually a good data layout for an update-heavy workload, is not well-suited for a read-mostly one and vice versa. Modern analytical data systems rely on columnar layouts and employ delta stores to inject new data and updates. We show that for hybrid workloads we can achieve close to one order of magnitude better performance by tailoring the column layout design to the data and query workload. Our approach navigates the possible design space of the physical layout: it organizes each column's data by determining the number of partitions, their corresponding sizes and ranges, and the amount of buffer space and how it is allocated. We frame these design decisions as an optimization problem that, given workload knowledge and performance requirements, provides an optimal physical layout for the workload at hand. To evaluate this work, we build an in-memory storage engine, Casper, and we show that it outperforms state-of-the-art data layouts of analytical systems for hybrid workloads. Casper delivers up to 2.32x higher throughput for update-intensive workloads and up to 2.14x higher throughput for hybrid workloads. 
We further show how to make data layout decisions robust to workload variation by carefully selecting the input of the optimization.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sintos:2019:SDC, author = "Stavros Sintos and Pankaj K. Agarwal and Jun Yang", title = "Selecting data to clean for fact checking: minimizing uncertainty vs. maximizing surprise", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "13", pages = "2408--2421", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3358701.3358708", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Nov 26 07:21:38 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the optimization problem of selecting numerical quantities to clean in order to fact-check claims based on such data. Oftentimes, such claims are technically correct, but they can still mislead for two reasons. First, data may contain uncertainty and errors. Second, data can be ``fished'' to advance particular positions. In practice, fact-checkers cannot afford to clean all data and must choose to clean what ``matters the most'' to checking a claim. We explore alternative definitions of what ``matters the most'': one is to ascertain claim qualities (by minimizing uncertainty in these measures), while an alternative is just to counter the claim (by maximizing the probability of finding a counterargument). We show whether the two objectives align with each other, with important implications on when fact-checkers should exercise care in selective data cleaning, to avoid potential bias introduced by their desire to counter claims. We develop efficient algorithms for solving the various variants of the optimization problem, showing significant improvements over naive solutions. 
The problem is particularly challenging because the objectives in the fact-checking context are complex, non-linear functions over data. We obtain results that generalize to a large class of functions, with potential applications beyond fact-checking.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Chawla:2019:RMQ, author = "Shuchi Chawla and Shaleen Deep and Paraschos Koutris and Yifeng Teng", title = "Revenue maximization for query pricing", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "1--14", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357378", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Buying and selling of data online has increased substantially over the last few years. Several frameworks have already been proposed that study query pricing in theory and practice. The key guiding principle in these works is the notion of arbitrage-freeness where the broker can set different prices for different queries made to the dataset, but must ensure that the pricing function does not provide the buyers with opportunities for arbitrage. However, little is known about revenue maximization aspect of query pricing. In this paper, we study the problem faced by a broker selling access to data with the goal of maximizing her revenue. We show that this problem can be formulated as a revenue maximization problem with single-minded buyers and unlimited supply, for which several approximation algorithms are known. We perform an extensive empirical evaluation of the performance of several pricing algorithms for the query pricing problem on real-world instances.
In addition to previously known approximation algorithms, we propose several new heuristics and analyze them both theoretically and experimentally. Our experiments show that algorithms with the best theoretical bounds are not necessarily the best empirically. We identify algorithms and heuristics that are both fast and also provide consistently good performance when valuations are drawn from a wide variety of distributions.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shi:2019:RTP, author = "Jieming Shi and Renchi Yang and Tianyuan Jin and Xiaokui Xiao and Yin Yang", title = "Realtime top-$k$ {Personalized PageRank} over large graphs on {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "15--28", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357379", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a graph G, a source node $ s \in G $ and a positive integer k, a top-$k$ Personalized PageRank (PPR) query returns the k nodes with the highest PPR values with respect to s, where the PPR of a node v measures its relevance from the perspective of source s. Top-$k$ PPR processing is a fundamental task in many important applications such as web search, social networks, and graph analytics. This paper aims to answer such a query in realtime, i.e., within less than 100ms, on an Internet-scale graph with billions of edges. This is far beyond the current state of the art, due to the immense computational cost of processing a PPR query. We achieve this goal with a novel algorithm kPAR, which utilizes the massive parallel processing power of GPUs.
The main challenge in designing a GPU-based PPR algorithm lies in that a GPU is mainly a parallel computation device, whereas PPR processing involves graph traversals and value propagation operations, which are inherently sequential and memory-bound. Existing scalable PPR algorithms are mostly described as single-thread CPU solutions that are resistant to parallelization. Further, they usually involve complex data structures which do not have efficient adaptations on GPUs. kPAR overcomes these problems via both novel algorithmic designs (namely, adaptive forward push and inverted random walks) and system engineering (e.g., load balancing) to realize the potential of GPUs. Meanwhile, kPAR provides rigorous guarantees on both result quality and worst-case efficiency. Extensive experiments show that kPAR is usually 10x faster than parallel adaptations of existing methods. Notably, on a billion-edge Twitter graph, kPAR answers a top-1000 PPR query in 42.4 milliseconds.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:FLS, author = "Sheng Wang and Zhifeng Bao and J. Shane Culpepper and Timos Sellis and Xiaolin Qin", title = "Fast large-scale trajectory clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "29--42", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357380", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we study the problem of large-scale trajectory data clustering, k -paths, which aims to efficiently identify k ``representative'' paths in a road network.
Unlike traditional clustering approaches that require multiple data-dependent hyperparameters, k -paths can be used for visual exploration in applications such as traffic monitoring, public transit planning, and site selection. By combining map matching with an efficient intermediate representation of trajectories and a novel edge-based distance (EBD) measure, we present a scalable clustering method to solve k -paths. Experiments verify that we can cluster millions of taxi trajectories in less than one minute, achieving improvements of up to two orders of magnitude over state-of-the-art solutions that solve similar trajectory clustering problems.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Herodotou:2019:ADT, author = "Herodotos Herodotou and Elena Kakoulli", title = "Automating distributed tiered storage management in cluster computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "43--56", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357381", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-intensive platforms such as Hadoop and Spark are routinely used to process massive amounts of data residing on distributed file systems like HDFS. Increasing memory sizes and new hardware technologies (e.g., NVRAM, SSDs) have recently led to the introduction of storage tiering in such settings. However, users are now burdened with the additional complexity of managing the multiple storage tiers and the data residing on them while trying to optimize their workloads. In this paper, we develop a general framework for automatically moving data across the available storage tiers in distributed file systems. 
Moreover, we employ machine learning for tracking and predicting file access patterns, which we use to decide when and which data to move up or down the storage tiers for increasing system performance. Our approach uses incremental learning to dynamically refine the models with new file accesses, allowing them to naturally adjust and adapt to workload changes over time. Our extensive evaluation using realistic workloads derived from Facebook and CMU traces compares our approach with several other policies and showcases significant benefits in terms of both workload performance and cluster efficiency.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Jung:2019:AAD, author = "Jinho Jung and Hong Hu and Joy Arulraj and Taesoo Kim and Woonhak Kang", title = "{APOLLO}: automatic detection and diagnosis of performance regressions in database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "57--70", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357382", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The practical art of constructing database management systems (DBMSs) involves a morass of trade-offs among query execution speed, query optimization speed, standards compliance, feature parity, modularity, portability, and other goals. It is no surprise that DBMSs, like all complex software systems, contain bugs that can adversely affect their performance. The performance of DBMSs is an important metric as it determines how quickly an application can take in new information and use it to make new decisions. Both developers and users face challenges while dealing with performance regression bugs. 
First, developers usually find it challenging to manually design test cases to uncover performance regressions since DBMS components tend to have complex interactions. Second, users encountering performance regressions are often unable to report them, as the regression-triggering queries could be complex and database-dependent. Third, developers have to expend a lot of effort on localizing the root cause of the reported bugs, due to the system complexity and software development complexity. Given these challenges, this paper presents the design of Apollo, a toolchain for automatically detecting, reporting, and diagnosing performance regressions in DBMSs. We demonstrate that Apollo automates the generation of regression-triggering queries, simplifies the bug reporting process for users, and enables developers to quickly pinpoint the root cause of performance regressions. By automating the detection and diagnosis of performance regressions, Apollo reduces the labor cost of developing efficient DBMSs.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Owaida:2019:LLD, author = "Muhsen Owaida and Gustavo Alonso and Laura Fogliarini and Anthony Hock-Koon and Pierre-Etienne Melet", title = "Lowering the latency of data processing pipelines through {FPGA} based hardware acceleration", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "1", pages = "71--85", month = sep, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3357377.3357383", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 2 06:49:03 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Web search engines often involve a complex pipeline of processing stages including computing, scoring, and ranking potential answers plus returning the sorted results. 
The latency of such pipelines can be improved by minimizing data movement, making stages faster, and merging stages. The throughput is determined by the stage with the smallest capacity and it can be improved by allocating enough parallel resources to each stage. In this paper we explore the possibility of employing hardware acceleration (an FPGA) as a way to improve the overall performance when computing answers to search queries. With a real use case as a baseline and motivation, we focus on accelerating the scoring function implemented as a decision tree ensemble, a common approach to scoring and classification in search systems. Our solution uses a novel decision tree ensemble implementation on an FPGA to: (1) increase the number of entries that can be scored per unit of time, and (2) provide a compact implementation that can be combined with previous stages. The resulting system, tested in Amazon F1 instances, significantly improves the quality of the search results and improves performance by two orders of magnitude over the existing CPU based solution.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Cai:2019:MSS, author = "Shaofeng Cai and Gang Chen and Beng Chin Ooi and Jinyang Gao", title = "Model slicing for supporting complex analytics with elastic inference cost and resource constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "86--99", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364325", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep learning models have been used to support analytics beyond simple aggregation, where deeper and wider models have been shown to yield great results. 
These models consume a huge amount of memory and computational operations. However, most of the large-scale industrial applications are often computational budget constrained. In practice, the peak workload of inference service could be 10x higher than the average cases, with the presence of unpredictable extreme cases. Lots of computational resources could be wasted during off-peak hours and the system may crash when the workload exceeds system capacity. How to support deep learning services with dynamic workload cost-efficiently remains a challenging problem. In this paper, we address the challenge with a general and novel training scheme called model slicing, which enables deep learning models to provide predictions within the prescribed computational resource budget dynamically. Model slicing could be viewed as an elastic computation solution without requiring more computational resources. Succinctly, each layer in the model is divided into groups of contiguous block of basic components (i.e. neurons in dense layers and channels in convolutional layers), and then partially ordered relation is introduced to these groups by enforcing that groups participated in each forward pass always starts from the first group to the dynamically-determined rightmost group. Trained by dynamically indexing the rightmost group with a single parameter slice rate, the network is engendered to build up group-wise and residual representation. Then during inference, a sub-model with fewer groups can be readily deployed for efficiency whose computation is roughly quadratic to the width controlled by the slice rate. Extensive experiments show that models trained with model slicing can effectively support on-demand workload with elastic inference cost.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Herlihy:2019:CCD, author = "Maurice Herlihy and Barbara Liskov and Liuba Shrira", title = "Cross-chain deals and adversarial commerce", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "100--113", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364326", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern distributed data management systems face a new challenge: how can autonomous, mutually-distrusting parties cooperate safely and effectively? Addressing this challenge brings up questions familiar from classical distributed systems: how to combine multiple steps into a single atomic action, how to recover from failures, and how to synchronize concurrent access to data. Nevertheless, each of these issues requires rethinking when participants are autonomous and potentially adversarial. We propose the notion of a cross-chain deal, a new way to structure complex distributed computations that manage assets in an adversarial setting. Deals are inspired by classical atomic transactions, but are necessarily different, in important ways, to accommodate the decentralized and untrusting nature of the exchange. We describe novel safety and liveness properties, along with two alternative protocols for implementing cross-chain deals in a system of independent blockchain ledgers. One protocol, based on synchronous communication, is fully decentralized, while the other, based on semi-synchronous communication, requires a globally shared ledger.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zois:2019:EMM, author = "Vasileios Zois and Vassilis J. Tsotras and Walid A. Najjar", title = "Efficient main-memory top-$k$ selection for multicore architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "114--127", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364327", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient Top-$k$ query evaluation relies on practices that utilize auxiliary data structures to enable early termination. Such techniques were designed to trade-off complex work in the buffer pool against costly access to disk-resident data. Parallel in-memory Top-$k$ selection with support for early termination presents a novel challenge because computation shifts higher up in the memory hierarchy. In this environment, data scan methods using SIMD instructions and multithreading perform well despite requiring evaluation of the complete dataset. Early termination schemes that favor simplicity require random access to resolve score ambiguity while those optimized for sequential access incur too many object evaluations. In this work, we introduce the concept of rank uncertainty, a measure of work efficiency that enables classifying existing solutions according to their potential for efficient parallel in-memory Top-$k$ selection. We identify data reordering and layering strategies as those having the highest potential and provide practical guidelines on how to adapt them for parallel in-memory execution (creating the VTA and SLA approaches).
In addition, we show that the number of object evaluations can be further decreased by combining data reordering with angle space partitioning (introducing PTA). Our extensive experimental evaluation on varying query parameters using both synthetic and real data, showcase that PTA exhibits between 2 and 4 orders of magnitude better query latency, and throughput when compared to prior work and our optimized algorithmic variants (i.e. VTA, SLA).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Bottcher:2019:SGC, author = "Jan B{\"o}ttcher and Viktor Leis and Thomas Neumann and Alfons Kemper", title = "Scalable garbage collection for in-memory {MVCC} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "128--141", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364328", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To support Hybrid Transaction and Analytical Processing (HTAP), database systems generally rely on Multi-Version Concurrency Control (MVCC). While MVCC elegantly enables lightweight isolation of readers and writers, it also generates outdated tuple versions, which, eventually, have to be reclaimed. Surprisingly, we have found that in HTAP workloads, this reclamation of old versions, i.e., garbage collection, often becomes the performance bottleneck. It turns out that in the presence of long-running queries, state-of-the-art garbage collectors are too coarse-grained. As a consequence, the number of versions grows quickly slowing down the entire system. Moreover, the standard background cleaning approach makes the system vulnerable to sudden spikes in workloads. 
In this work, we propose a novel garbage collection (GC) approach that prunes obsolete versions eagerly. Its seamless integration into the transaction processing keeps the GC overhead minimal and ensures good scalability. We show that our approach handles mixed workloads well and also speeds up pure OLTP workloads like TPC-C compared to existing state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2019:FDD, author = "Bohua Yang and Dong Wen and Lu Qin and Ying Zhang and Xubo Wang and Xuemin Lin", title = "Fully dynamic depth-first search in directed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "142--154", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364329", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Depth-first search (DFS) is a fundamental and important algorithm in graph analysis. It is the basis of many graph algorithms such as computing strongly connected components, testing planarity, and detecting biconnected components. The result of a DFS is normally shown as a DFS-Tree. Given the frequent updates in many real-world graphs (e.g., social networks and communication networks), we study the problem of DFS-Tree maintenance in dynamic directed graphs. In the literature, most works focus on the DFS-Tree maintenance problem in undirected graphs and directed acyclic graphs. However, their methods cannot easily be applied in the case of general directed graphs. Motivated by this, we propose a framework and corresponding algorithms for both edge insertion and deletion in general directed graphs. We further give several optimizations to speed up the algorithms. 
We conduct extensive experiments on 12 real-world datasets to show the efficiency of our proposed algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ma:2019:LMC, author = "Chenhao Ma and Reynold Cheng and Laks V. S. Lakshmanan and Tobias Grubenmann and Yixiang Fang and Xiaodong Li", title = "{LINC}: a motif counting algorithm for uncertain graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "155--168", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364330", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In graph applications (e.g., biological and social networks), various analytics tasks (e.g., clustering and community search) are carried out to extract insight from large and complex graphs. Central to these tasks is the counting of the number of motifs, which are graphs with a few nodes. Recently, researchers have developed several fast motif counting algorithms. Most of these solutions assume that graphs are deterministic, i.e., the graph edges are certain to exist. However, due to measurement and statistical prediction errors, this assumption may not hold, and hence the analysis quality can be affected. To address this issue, we examine how to count motifs on uncertain graphs, whose edges only exist probabilistically. Particularly, we propose a solution framework that can be used by existing deterministic motif counting algorithms. We further propose an approximation algorithm. Extensive experiments on real datasets show that our algorithms are more effective and efficient than existing solutions.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Eskandarian:2019:OOQ, author = "Saba Eskandarian and Matei Zaharia", title = "{ObliDB}: oblivious query processing for secure databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "169--183", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364331", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hardware enclaves such as Intel SGX are a promising technology for improving the security of databases outsourced to the cloud. These enclaves provide an execution environment isolated from the hypervisor/OS, and encrypt data in RAM. However, for applications that use large amounts of memory, including most databases, enclaves do not protect against access pattern leaks, which let attackers gain a large amount of information about the data. Moreover, the na{\"\i}ve way to address this issue, using Oblivious RAM (ORAM) primitives from the security literature, adds substantial overhead. A number of recent works explore trusted hardware enclaves as a path toward secure, access-pattern oblivious outsourcing of data storage and analysis. While these works efficiently solve specific subproblems (e.g. building secure indexes or running analytics queries that always scan entire tables), no prior work has supported oblivious query processing for general query workloads on a DBMS engine with multiple access methods. Moreover, applying these techniques individually does not guarantee that an end-to-end workload, such as a complex SQL query over multiple tables, will be oblivious. In this paper, we introduce ObliDB, an oblivious database engine design that is the first system to provide obliviousness for general database read workloads over multiple access methods.
ObliDB introduces a diverse array of new oblivious physical operators to accelerate oblivious SQL queries, giving speedups of up to an order of magnitude over na{\"\i}ve ORAM. It supports a broad range of queries, including aggregation, joins, insertions, deletions and point queries. We implement ObliDB and show that, on analytics workloads, ObliDB ranges from 1.1--19x faster than Opaque, a previous oblivious, enclave-based system designed only for analytics, and comes within 2.6x of Spark SQL, which provides no security guarantees. In addition, ObliDB supports point queries with 3--10ms latency, which is comparable to index-only trusted hardware systems, and runs over 7x faster than HIRB, a previous encryption-based oblivious index system that supports point queries.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ge:2019:SMP, author = "Chang Ge and Ihab F. Ilyas and Florian Kerschbaum", title = "Secure multi-party functional dependency discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "184--196", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364332", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data profiling is an important task to understand data semantics and is an essential pre-processing step in many tools. Due to privacy constraints, data is often partitioned into silos, with different access control. Discovering functional dependencies (FDs) usually requires access to all data partitions to find constraints that hold on the whole dataset. Simply applying general secure multi-party computation protocols incurs high computation and communication cost. This paper formulates the FD discovery problem in the secure multi-party scenario.
We propose secure constructions for validating candidate FDs, and present efficient cryptographic protocols to discover FDs over distributed partitions. Experimental results show that solution is practically efficient over non-secure distributed FD discovery, and can significantly outperform general purpose multi-party computation frameworks. To the best of our knowledge, our work is the first one to tackle this problem.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Wang:2019:VFM, author = "Minmei Wang and Mingxun Zhou and Shouqian Shi and Chen Qian", title = "Vacuum filters: more space-efficient and faster replacement for {Bloom} and cuckoo filters", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "197--210", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364333", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present vacuum filters, a type of data structures to support approximate membership queries. Vacuum filters cost the smallest space among all known AMQ data structures and provide higher insertion and lookup throughput in most situations. Hence they can be used as the replacement of the widely used Bloom filters and cuckoo filters. Similar to cuckoo filters, vacuum filters also store item fingerprints in a table. The memory-efficiency and throughput improvements are from the innovation of a table insertion and fingerprint eviction strategy that achieves both high load factor and data locality without any restriction of the table size. In addition, we propose a new update framework to resolve two difficult problems for AMQ structures under dynamics, namely duplicate insertions and set resizing. 
The experiments show that vacuum filters can achieve 25\% less space in average and similar throughput compared to cuckoo filters, and 15\% less space and $ > 10 \times $ throughput compared to Bloom filters, with same false positive rates. AMQ data structures are widely used in various layers of computer systems and networks and are usually hosted in platforms where memory is limited and precious. Hence the improvements brought by vacuum filters can be considered significant.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2019:SES, author = "Yihan Sun and Guy E. Blelloch and Wan Shen Lim and Andrew Pavlo", title = "On supporting efficient snapshot isolation for hybrid workloads with multi-versioned indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "211--225", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364334", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data-driven applications require that databases support fast analytical queries while undergoing rapid updates---often referred to as Hybrid Transactional Analytical Processing (HTAP). Achieving fast queries and updates in a database management system (DBMS) is challenging since optimizations to improve analytical queries can cause overhead for updates. One solution is to use snapshot isolation (SI) for multi-version concurrency control (MVCC) to allow readers to make progress regardless of concurrent writers. In this paper, we propose the Parallel Binary Tree (P-Tree) index structure to achieve SI and MVCC for multicore in-memory HTAP DBMSs. At their core, P-Trees are based on pure (immutable) data structures that use path-copying for updates for fast multi-versioning. 
They support tree nesting to improve OLAP performance while still allowing for efficient updates. The data structure also enables parallel algorithms for bulk operations on indexes and their underlying tables. We evaluate P-Trees on OLTP and OLAP benchmarks, and compare them with state-of-the-art data structures and DBMSs. Our experiments show that P-Trees outperform many concurrent data structures for the YCSB workload, and is 4--9x faster than existing DBMSs for analytical queries, while also achieving reasonable throughput for simultaneous transactional updates.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Fang:2019:IMV, author = "Zhuhe Fang and Beilei Zheng and Chuliang Weng", title = "Interleaved multi-vectorizing", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "226--238", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368290", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SIMD is an instruction set in mainstream processors, which provides the data level parallelism to accelerate the performance of applications. However, its advantages diminish when applications suffer from heavy cache misses. To eliminate cache misses in SIMD vectorization, we present interleaved multi-vectorizing (IMV) in this paper. It interleaves multiple execution instances of vectorized code to hide memory access latency with more computation. We also propose residual vectorized states to solve the control flow divergence in vectorization. IMV can make full use of the data parallelism in SIMD and the memory level parallelism through prefetching.
It reduces cache misses, branch misses and computation overhead to significantly speed up the performance of pointer-chasing applications, and it can be applied to executing entire query pipelines. As experimental results show, IMV achieves up to 4.23X and 3.17X better performance compared with the pure scalar implementation and the pure SIMD vectorization, respectively.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Shetiya:2019:UOA, author = "Suraj Shetiya and Abolfazl Asudeh and Sadia Ahmed and Gautam Das", title = "A unified optimization algorithm for solving {``regret-minimizing representative''} problems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "239--251", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368291", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a database with numeric attributes, it is often of interest to rank the tuples according to linear scoring functions. For a scoring function and a subset of tuples, the regret of the subset is defined as the (relative) difference in scores between the top-1 tuple of the subset and the top-1 tuple of the entire database. Finding the regret-ratio minimizing set (RRMS), i.e., the subset of a required size k that minimizes the maximum regret-ratio across all possible ranking functions, has been a well-studied problem in recent years. This problem is known to be NP-complete and there are several approximation algorithms for it. Other NP-complete variants have also been investigated, e.g., finding the set of size k that minimizes the average regret ratio over all linear functions. 
Prior work have designed customized algorithms for different variants of the problem, and are unlikely to easily generalize to other variants. In this paper we take a different path towards tackling these problems. In contrast to the prior, we propose a unified algorithm for solving different problem variants. Unification is done by localizing the customization to the design of variant-specific subroutines or ``oracles'' that are called by our algorithm. Our unified algorithm takes inspiration from the seemingly unrelated problem of clustering from data mining, and the corresponding k-medoid algorithm. We make several innovative contributions in designing our algorithm, including various techniques such as linear programming, edge sampling in graphs, volume estimation of multi-dimensional convex polytopes, and several others. We provide rigorous theoretical analysis, as well as substantial experimental evaluations over real and synthetic data sets to demonstrate the practical feasibility of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kandula:2019:PDI, author = "Srikanth Kandula and Laurel Orr and Surajit Chaudhuri", title = "Pushing data-induced predicates through joins in big-data clusters", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "252--265", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368292", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Using data statistics, we convert predicates on a table into data induced predicates (diPs) that apply on the joining tables. Doing so substantially speeds up multi-relation queries because the benefits of predicate pushdown can now apply beyond just the tables that have predicates. 
We use diPs to skip data exclusively during query optimization; i.e., diPs lead to better plans and have no overhead during query execution. We study how to apply diPs for complex query expressions and how the usefulness of diPs varies with the data statistics used to construct diPs and the data distributions. Our results show that building diPs using zone-maps which are already maintained in today's clusters leads to sizable data skipping gains. Using a new (slightly larger) statistic, 50\% of the queries in the TPC-H, TPC-DS and JoinOrder benchmarks can skip at least 33\% of the query input. Consequently, the median query in a production big-data cluster finishes roughly 2x faster.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Pena:2019:DAE, author = "Eduardo H. M. Pena and Eduardo C. de Almeida and Felix Naumann", title = "Discovery of approximate (and exact) denial constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "266--278", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368293", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Maintaining data consistency is known to be hard. Recent approaches have relied on integrity constraints to deal with the problem --- correct and complete constraints naturally work towards data consistency. State-of-the-art data cleaning frameworks have used the formalism known as denial constraint (DC) to handle a wide range of real-world constraints. Each DC expresses a relationship between predicates that indicate which combinations of attribute values are inconsistent. The design of DCs, however, must keep pace with the complexity of data and applications. 
The alternative to designing DCs by hand is automatically discovering DCs from data, which is computationally expensive due to the large search space of DCs. To tackle this challenging task, we present a novel algorithm to efficiently discover DCs: DCFinder. The algorithm combines data structures called position list indexes with techniques based on predicate selectivity to efficiently validate DC candidates. Because the available data often contain errors, DCFinder is especially designed to discovering approximate DCs, i.e., DCs that may partially hold. Our experimental evaluation uses real and synthetic datasets and shows that DCFinder outperforms all the existing approximate DC discovery algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Yang:2019:DUC, author = "Zongheng Yang and Eric Liang and Amog Kamsetty and Chenggang Wu and Yan Duan and Xi Chen and Pieter Abbeel and Joseph M. Hellerstein and Sanjay Krishnan and Ion Stoica", title = "Deep unsupervised cardinality estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "279--292", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368294", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality estimation has long been grounded in statistical tools for density estimation. To capture the rich multivariate distributions of relational tables, we propose the use of a new type of high-capacity statistical model: deep autoregressive models. However, direct application of these models leads to a limited estimator that is prohibitively expensive to evaluate for range or wildcard predicates. 
To produce a truly usable estimator, we develop a Monte Carlo integration scheme on top of autoregressive models that can efficiently handle range queries with dozens of dimensions or more. Like classical synopses, our estimator summarizes the data without supervision. Unlike previous solutions, we approximate the joint data distribution without any independence assumptions. Evaluated on real-world datasets and compared against real systems and dominant families of techniques, our estimator achieves single-digit multiplicative error at tail, an up to 90x accuracy improvement over the second best method, and is space- and runtime-efficient.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ding:2019:FGI, author = "Zeyu Ding and Yuxin Wang and Danfeng Zhang and Daniel Kifer", title = "Free gap information from the differentially private sparse vector and noisy max mechanisms", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "293--306", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368295", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Noisy Max and Sparse Vector are selection algorithms for differential privacy and serve as building blocks for more complex algorithms. In this paper we show that both algorithms can release additional information for free (i.e., at no additional privacy cost). Noisy Max is used to return the approximate maximizer among a set of queries. We show that it can also release for free the noisy gap between the approximate maximizer and runner-up. This free information can improve the accuracy of certain subsequent counting queries by up to 50\%. 
Sparse Vector is used to return a set of queries that are approximately larger than a fixed threshold. We show that it can adaptively control its privacy budget (use less budget for queries that are likely to be much larger than the threshold) in order to increase the amount of queries it can process. These results follow from a careful privacy analysis.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Sun:2019:EEL, author = "Ji Sun and Guoliang Li", title = "An end-to-end learning-based cost estimator", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "307--319", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368296", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cost and cardinality estimation is vital to query optimizer, which can guide the query plan selection. However traditional empirical cost and cardinality estimation techniques cannot provide high-quality estimation, because they may not effectively capture the correlation between multiple tables. Recently the database community shows that the learning-based cardinality estimation is better than the empirical methods. However, existing learning-based methods have several limitations. Firstly, they focus on estimating the cardinality, but cannot estimate the cost. Secondly, they are either too heavy or hard to represent complicated structures, e.g., complex predicates. To address these challenges, we propose an effective end-to-end learning-based cost estimation framework based on a tree-structured model, which can estimate both cost and cardinality simultaneously. We propose effective feature extraction and encoding techniques, which consider both queries and physical operations in feature extraction. 
We embed these features into our tree-structured model. We propose an effective method to encode string values, which can improve the generalization ability for predicate matching. As it is prohibitively expensive to enumerate all string values, we design a pattern-based method, which selects patterns to cover string values and utilizes the patterns to embed string values. We conducted experiments on real-world datasets and experimental results showed that our method outperformed baselines.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zeng:2019:LMD, author = "Yuxiang Zeng and Yongxin Tong and Lei Chen", title = "Last-mile delivery made practical: an efficient route planning framework with theoretical guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "320--333", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368297", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Last-mile delivery (LMD) refers to the movement of goods from transportation origins to the final destinations. It has widespread applications such as urban logistics, e-commerce, etc. One fundamental problem in last-mile delivery is route planning, which schedules multiple couriers' routes, i.e., sequences of origins and destinations of the requests under certain optimization objectives. Prior studies usually designed heuristic solutions to two strongly NP-hard optimization objectives: minimizing the makespan (i.e., maximum travel time) of couriers and total latency (i.e., waiting time) of requesters. There is no algorithm with theoretical guarantees for either optimization objective in practical cases.
In this paper, we propose a theoretically guaranteed solution framework for both objectives. It achieves both approximation ratios of $ 6 \rho $, where $ \rho $ is the approximation ratio of a core operation, called $k$LMD, which plans for one courier a route consisting of $k$ requests. Leveraging a spatial index called hierarchically separated tree, we further design an efficient approximation algorithm for $k$LMD with $ \rho = O(\log n)$, where $n$ is the number of requests. Experimental results show that our approach outperforms state-of-the-art methods by averagely 48.4\%--96.0\% and 49.7\%--96.1\% for both objectives. Especially in large-scale real datasets, our algorithm has $ 29.3 \times $--$ 108.9 \times $ shorter makespan and $ 20.2 \times $--$ 175.1 \times $ lower total latency than the state-of-the-art algorithms.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kepe:2019:DPM, author = "Tiago R. Kepe and Eduardo C. de Almeida and Marco A. Z. Alves", title = "Database processing-in-memory: an experimental study", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "334--347", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368298", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rapid growth of ``big-data'' intensified the problem of data movement when processing data analytics: Large amounts of data need to move through the memory up to the CPU before any computation takes place. To tackle this costly problem, Processing-in-Memory (PIM) inverts the traditional data processing by pushing computation to memory with an impact on performance and energy efficiency.
In this paper, we present an experimental study on processing database SIMD operators in PIM compared to current x86 processor (i.e., using AVX512 instructions). We discuss the execution time gap between those architectures. However, this is the first experimental study, in the database community, to discuss the trade-offs of execution time and energy consumption between PIM and x86 in the main query execution systems: materialized, vectorized, and pipelined. We also discuss the results of a hybrid query scheduling when interleaving the execution of the SIMD operators between PIM and x86 processing hardware. In our results, the hybrid query plan reduced the execution time by 45\%. It also drastically reduced energy consumption by more than 2x compared to hardware-specific query plans.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Leeka:2019:ISO, author = "Jyoti Leeka and Kaushik Rajan", title = "Incorporating super-operators in big-data query optimizers", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "348--361", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368299", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The cost of big-data analytics is dominated by shuffle operations that induce multiple disk reads, writes and network transfers. This paper proposes a new class of optimization rules that are specifically aimed at eliminating shuffles where possible. The rules substitute multiple shuffle inducing operators (Join, UnionAll, Spool, GroupBy) with a single streaming operator which implements an entire sub-query. We call such operators super-operators.
A key challenge with adding new rules that substitute sub-queries with super-operators is that there are many variants of the same sub-query that can be implemented via minor modifications to the same super-operator. Adding each as a separate rule leads to a search space explosion. We propose several extensions to the query optimizer to address this challenge. We propose a new abstract representation for operator trees that captures all possible sub-queries that a super-operator implements. We propose a new rule matching algorithm that can efficiently search for abstract operator trees. Finally we extend the physical operator interface to introduce new parametric super-operators. We implement our changes in SCOPE, a state-of-the-art production big-data optimizer used extensively at Microsoft. We demonstrate that the proposed optimizations provide significant reduction in both resource cost (average 1.7x) and latency (average 1.5x) on several production queries, and do so without increasing optimization time.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Li:2019:EPM, author = "Conggai Li and Fan Zhang and Ying Zhang and Lu Qin and Wenjie Zhang and Xuemin Lin", title = "Efficient progressive minimum $k$-core search", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "362--375", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368300", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As one of the most representative cohesive subgraph models, $k$-core model has recently received significant attention in the literature. 
In this paper, we investigate the problem of the minimum $k$-core search: given a graph $G$, an integer $k$ and a set of query vertices $ Q = \{ q \} $, we aim to find the smallest $k$-core subgraph containing every query vertex $ q \in Q $. It has been shown that this problem is NP-hard with a huge search space, and it is very challenging to find the optimal solution. There are several heuristic algorithms for this problem, but they rely on simple scoring functions and there is no guarantee as to the size of the resulting subgraph, compared with the optimal solution. Our empirical study also indicates that the size of their resulting subgraphs may be large in practice. In this paper, we develop an effective and efficient progressive algorithm, namely PSA, to provide a good trade-off between the quality of the result and the search time. Novel lower and upper bound techniques for the minimum $k$-core search are designed. Our extensive experiments on 12 real-life graphs demonstrate the effectiveness and efficiency of the new techniques.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhu:2019:HNL, author = "Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael and Dan R. K. Ports and Ion Stoica and Xin Jin", title = "{Harmonia}: near-linear scalability for replicated storage with in-network conflict detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "376--389", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368301", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed storage employs replication to mask failures and improve availability. However, these systems typically exhibit a hard tradeoff between consistency and performance.
Ensuring consistency introduces coordination overhead, and as a result the system throughput does not scale with the number of replicas. We present Harmonia, a replicated storage architecture that exploits the capability of new-generation programmable switches to obviate this tradeoff by providing near-linear scalability without sacrificing consistency. To achieve this goal, Harmonia detects read-write conflicts in the network, which enables any replica to serve reads for objects with no pending writes. Harmonia implements this functionality at line rate, thus imposing no performance overhead. We have implemented a prototype of Harmonia on a cluster of commodity servers connected by a Barefoot Tofino switch, and have integrated it with Redis. We demonstrate the generality of our approach by supporting a variety of replication protocols, including primary-backup, chain replication, Viewstamped Replication, and NOPaxos. Experimental results show that Harmonia improves the throughput of these protocols by up to $ 10 \times $ for a replication factor of $ 10 $, providing near-linear scalability up to the limit of our testbed.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Walenz:2019:LSC, author = "Brett Walenz and Stavros Sintos and Sudeepa Roy and Jun Yang", title = "Learning to sample: counting with complex queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "390--402", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368302", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of efficiently estimating counts for queries involving complex filters, such as user-defined functions, or predicates involving self-joins and correlated subqueries. 
For such queries, traditional sampling techniques may not be applicable due to the complexity of the filter preventing sampling over joins, and sampling after the join may not be feasible due to the cost of computing the full join. The other natural approach of training and using an inexpensive classifier to estimate the count instead of the expensive predicate suffers from the difficulties in training a good classifier and giving meaningful confidence intervals. In this paper we propose a new method of learning to sample where we combine the best of both worlds by using sampling in two phases. First, we use samples to learn a probabilistic classifier, and then use the classifier to design a stratified sampling method to obtain the final estimates. We theoretically analyze algorithms for obtaining an optimal stratification, and compare our approach with a suite of natural alternatives like quantification learning, weighted and stratified sampling, and other techniques from the literature. We also provide extensive experiments in diverse use cases using multiple real and synthetic datasets to evaluate the quality, efficiency, and robustness of our approach.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Echihabi:2019:RLH, author = "Karima Echihabi and Kostas Zoumpatianos and Themis Palpanas and Houda Benbrahim", title = "Return of the {Lernaean Hydra}: experimental evaluation of data series approximate similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "3", pages = "403--420", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3368289.3368303", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Dec 11 07:51:13 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data series are a special type of multidimensional data present in numerous domains, where similarity search is a key operation that has been extensively studied in the data series literature. In parallel, the multidimensional community has studied approximate similarity search techniques. We propose a taxonomy of similarity search techniques that reconciles the terminology used in these two domains, we describe modifications to data series indexing techniques enabling them to answer approximate similarity queries with quality guarantees, and we conduct a thorough experimental evaluation to compare approximate similarity search techniques under a unified framework, on synthetic and real datasets in memory and on disk. Although data series differ from generic multidimensional vectors (series usually exhibit correlation between neighboring values), our results show that data series techniques answer approximate queries with strong guarantees and an excellent empirical performance, on data series and vectors alike. These techniques outperform the state-of-the-art approximate techniques for vectors when operating on disk, and remain competitive in memory.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhou:2019:DDI, author = "Xinjing Zhou and Lidan Shou and Ke Chen and Wei Hu and Gang Chen", title = "{DPTree}: differential indexing for persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "421--434", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372717", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The emergence of persistent memory (PM) spurs on redesigns of database system components to gain full exploitation of the persistence and speed of the hardware. One crucial component studied by researchers is persistent indices. However, such studies to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karimov:2019:AAH, author = "Jeyhun Karimov and Tilmann Rabl and Volker Markl", title = "{AJoin}: ad-hoc stream joins at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "435--448", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372718", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The processing model of state-of-the-art stream processing engines is designed to execute long-running queries one at a time. However, with the advance of cloud technologies and multi-tenant systems, multiple users share the same cloud for stream query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Luo:2019:PSL, author = "Chen Luo and Michael J. Carey", title = "On performance stability in {LSM}-based storage systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "449--462", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372719", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Log-Structured Merge-Tree (LSM-tree) has been widely adopted for use in modern NoSQL systems for its superior write performance. Despite the popularity of LSM-trees, they have been criticized for suffering from write stalls and large performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Peng:2019:TBT, author = "You Peng and Ying Zhang and Xuemin Lin and Wenjie Zhang and Lu Qin and Jingren Zhou", title = "Towards bridging theory and practice: hop-constrained $s$--$t$ simple path enumeration", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "463--476", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372720", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph is a ubiquitous structure representing entities and their relationships applied in many areas such as social networks, web graphs, and biological networks. One of the fundamental tasks in graph analytics is to investigate the relations between two \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Zhang:2019:PDS, author = "Yuhao Zhang and Arun Kumar", title = "{Panorama}: a data system for unbounded vocabulary querying over video", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "477--491", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372721", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep convolutional neural networks (CNNs) achieve state-of-the-art accuracy for many computer vision tasks. But using them for video monitoring applications incurs high computational cost and inference latency. Thus, recent works have studied how to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lakhotia:2019:PTS, author = "Kartik Lakhotia and Rajgopal Kannan and Qing Dong and Viktor Prasanna", title = "Planting trees for scalable and efficient canonical hub labeling", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "492--505", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372722", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hub labeling is widely used to improve the latency and throughput of Point-to-Point Shortest Distance (PPSD) queries in graph databases. However, constructing hub labeling, even via the state-of-the-art Pruned Landmark Labeling (PLL) algorithm is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lahoti:2019:OIF, author = "Preethi Lahoti and Krishna P. Gummadi and Gerhard Weikum", title = "Operationalizing individual fairness with pairwise fair representations", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "506--518", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372723", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We revisit the notion of individual fairness proposed by Dwork et al. A central challenge in operationalizing their approach is the difficulty in eliciting a human specification of a similarity metric. In this paper, we propose an operationalization of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kakaraparthy:2019:ODL, author = "Aarati Kakaraparthy and Jignesh M. Patel and Kwanghyun Park and Brian P. Kroth", title = "Optimizing databases by learning hidden parameters of solid state drives", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "519--532", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372724", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Solid State Drives (SSDs) are complex devices with varying internal implementations, resulting in subtle differences in behavior between devices. In this paper, we demonstrate how a database engine can be optimized for a particular device by learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kang:2019:BOD, author = "Daniel Kang and Peter Bailis and Matei Zaharia", title = "{BlazeIt}: optimizing declarative aggregation and limit queries for neural network-based video analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "533--546", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372725", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent advances in neural networks (NNs) have enabled automatic querying of large volumes of video data with high accuracy. While these deep NNs can produce accurate annotations of an object's position and type in video, they are computationally \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Huang:2019:JST, author = "Dawei Huang and Dong Young Yoon and Seth Pettie and Barzan Mozafari", title = "Joins on samples: a theoretical guide for practitioners", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "547--560", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372726", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite decades of research on AQP (approximate query processing), our understanding of sample-based joins has remained limited and, to some extent, even superficial. The common belief in the community is that joining random samples is futile. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Karagiannis:2019:MAK, author = "Georgios Karagiannis and Immanuel Trummer and Saehan Jo and Shubham Khandelwal and Xuezhi Wang and Cong Yu", title = "Mining an ``anti-knowledge base'' from {Wikipedia} updates with applications to fact checking and beyond", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "561--573", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372727", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce the problem of anti-knowledge mining. Our goal is to create an {``anti-knowledge base''} that contains factual mistakes. The resulting data can be used for analysis, training, and benchmarking in the research domain of automated fact checking. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Lersch:2019:EPM, author = "Lucas Lersch and Xiangpeng Hao and Ismail Oukid and Tianzheng Wang and Thomas Willhalm", title = "Evaluating persistent memory range indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "4", pages = "574--587", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3372716.3372728", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jan 8 18:50:37 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Persistent memory (PM) is fundamentally changing the way database index structures are built by enabling persistence, high performance, and (near) instant recovery all on the memory bus. Prior work has proposed many techniques to tailor index structure \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Goldstein:2020:MBR, author = "Jonathan Goldstein and Ahmed Abdelhamid and Mike Barnett and Sebastian Burckhardt and Badrish Chandramouli and Darren Gehring and Niel Lebeck and Christopher Meiklejohn and Umar Farooq Minhas and Ryan Newton and Rahee Ghosh Peshawaria and Tal Zaccai and Irene Zhang", title = "{A.M.B.R.O.S.I.A}: providing performant virtual resiliency for distributed applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "588--601", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377370", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377370", abstract = "When writing today's distributed programs, which frequently span both devices and cloud services, programmers are faced with complex decisions and coding tasks around coping with failure, especially when these distributed components are stateful. If \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ouyang:2020:ESP, author = "Dian Ouyang and Long Yuan and Lu Qin and Lijun Chang and Ying Zhang and Xuemin Lin", title = "Efficient shortest path index maintenance on dynamic road networks with theoretical guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "602--615", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377371", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377371", abstract = "Computing the shortest path between two vertices is a fundamental problem in road networks that is applied in a wide variety of applications. To support efficient shortest path query processing, a plethora of index-based methods have been proposed in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Stehle:2020:PMP, author = "Elias Stehle and Hans-Arno Jacobsen", title = "{ParPaRaw}: massively parallel parsing of delimiter-separated raw data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "616--628", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377372", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377372", abstract = "Parsing is essential for a wide range of use cases, such as stream processing, bulk loading, and in-situ querying of raw data. 
Yet, the compute-intense step often constitutes a major bottleneck in the data ingestion pipeline, since parsing of inputs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2020:OOC, author = "Yihe Huang and William Qian and Eddie Kohler and Barbara Liskov and Liuba Shrira", title = "Opportunities for optimism in contended main-memory multicore transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "629--642", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377373", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377373", abstract = "Optimistic concurrency control, or OCC, can achieve excellent performance on uncontended workloads for main-memory transactional databases. Contention causes OCC's performance to degrade, however, and recent concurrency control designs, such as hybrid \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2020:PLF, author = "Bolong Zheng and Xi Zhao and Lianggui Weng and Nguyen Quoc Viet Hung and Hang Liu and Christian S. 
Jensen", title = "{PM-LSH}: a fast and accurate {LSH} framework for high-dimensional approximate {NN} search", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "643--655", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377374", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377374", abstract = "Nearest neighbor (NN) search in high-dimensional spaces is inherently computationally expensive due to the curse of dimensionality. As a well-known solution to approximate NN search, locality-sensitive hashing (LSH) is able to answer c-approximate NN (c-\ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2020:HMB, author = "Yahui Sun and Jun Luo and Theodoros Lappas and Xiaokui Xiao and Bin Cui", title = "Hunting multiple bumps in graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "656--669", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377375", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377375", abstract = "Bump hunting is an important approach to the extraction of insights from Euclidean datasets. Recently, it has been explored for graph datasets for the first time, and a single bump is hunted in an unweighted graph in this exploration. Here, we extend \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2020:HNE, author = "Renchi Yang and Jieming Shi and Xiaokui Xiao and Yin Yang and Sourav S. Bhowmick", title = "Homogeneous network embedding for massive graphs via reweighted personalized {PageRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "670--683", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377376", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377376", abstract = "Given an input graph $G$ and a node $ v \in G $, homogeneous network embedding (HNE) maps the graph structure in the vicinity of $v$ to a compact, fixed-dimensional feature vector. This paper focuses on HNE for massive graphs, e.g., with billions of edges. On \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qahtan:2020:PFD, author = "Abdulhakim Qahtan and Nan Tang and Mourad Ouzzani and Yang Cao and Michael Stonebraker", title = "Pattern functional dependencies for data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "684--697", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377377", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377377", abstract = "Patterns (or regex-based expressions) are widely used to constrain the format of a domain (or a column), e.g., a Year column should contain only four digits, and thus a value like ``1980-'' might be a typo. 
Moreover, integrity constraints (ICs) defined \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Whang:2020:MMV, author = "Joyce Jiyoung Whang and Rundong Du and Sangwon Jung and Geon Lee and Barry Drake and Qingqing Liu and Seonggoo Kang and Haesun Park", title = "{MEGA}: multi-view semi-supervised clustering of hypergraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "698--711", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377378", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377378", abstract = "Complex relationships among entities can be modeled very effectively using hypergraphs. Hypergraphs model real-world data by allowing a hyperedge to include two or more entities. Clustering of hypergraphs enables us to group the similar entities \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koumarelas:2020:MDD, author = "Ioannis Koumarelas and Thorsten Papenbrock and Felix Naumann", title = "{MDedup}: duplicate detection with matching dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "712--725", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377379", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377379", abstract = "Duplicate detection is an integral part of data cleaning and serves to identify multiple representations of same real-world entities in (relational) datasets. 
Existing duplicate detection approaches are effective, but they are also hard to parameterize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tran:2020:PVU, author = "Van-Dang Tran and Hiroyuki Kato and Zhenjiang Hu", title = "Programmable view update strategies on relations", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "726--739", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377380", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377380", abstract = "View update is an important mechanism that allows updates on a view by translating them into the corresponding updates on the base relations. The existing literature has shown the ambiguity of translating view updates. To address this ambiguity, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kumar:2020:ADD, author = "Avinash Kumar and Zuozhi Wang and Shengquan Ni and Chen Li", title = "{Amber}: a debuggable dataflow system based on the actor model", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "740--753", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377381", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377381", abstract = "A long-running analytic task on big data often leaves a developer in the dark without providing valuable feedback about the status of the execution. 
In addition, a failed job that needs to restart from scratch can waste earlier computing resources. An \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schiavio:2020:DSO, author = "Filippo Schiavio and Daniele Bonetta and Walter Binder", title = "Dynamic speculative optimizations for {SQL} compilation in {Apache Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "754--767", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377382", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377382", abstract = "Big-data systems have gained significant momentum, and Apache Spark is becoming a de-facto standard for modern data analytics. Spark relies on SQL query compilation to optimize the execution performance of analytical workloads on a variety of data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khayati:2020:MGE, author = "Mourad Khayati and Alberto Lerner and Zakhar Tymchenko and Philippe Cudr{\'e}-Mauroux", title = "Mind the gap: an experimental evaluation of imputation of missing values techniques in time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "5", pages = "768--782", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3377369.3377383", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:27 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377383", abstract = "Recording sensor data is seldom a perfect process. 
Failures in power, communication or storage can leave occasional blocks of data missing, affecting not only real-time monitoring but also compromising the quality of near- and off-line data analysis. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mofrad:2020:GNA, author = "Mohammad Hasanzadeh Mofrad and Rami Melhem and Yousuf Ahmad and Mohammad Hammoud", title = "{Graphite}: a {NUMA}-aware {HPC} system for graph analytics based on a new {MPI * X} parallelism model", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "783--797", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380751", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380751", abstract = "In this paper, we propose a new parallelism model denoted as MPI * X and suggest a linear algebra-based graph analytics system, namely, Graphite, which effectively employs it. MPI * X promotes thread-based partitioning to distribute computation and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boer:2020:PIA, author = "Naama Boer and Daniel Deutch and Nave Frost and Tova Milo", title = "Personal insights for altering decisions of tree-based ensembles over time", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "798--811", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380752", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380752", abstract = "Machine Learning models are prevalent in critical human-related decision making, such as resume filtering and loan applications. Refused individuals naturally ask what could change the decision, should they reapply. This question is hard for the model \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Peng:2020:ABS, author = "You Peng and Ying Zhang and Xuemin Lin and Lu Qin and Wenjie Zhang", title = "Answering billion-scale label-constrained reachability queries within microsecond", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "812--825", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380753", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380753", abstract = "In this paper, we study the problem of label-constrained reachability (LCR) query which is fundamental in many applications with directed edge-label graphs. 
Although the classical reachability query (i.e., reachability query without label constraint) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2020:EER, author = "Ruihong Huang and Shaoxu Song and Yunsu Lee and Jungho Park and Soo-Hyung Kim and Sungmin Yi", title = "Effective and efficient retrieval of structured entities", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "826--839", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380754", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380754", abstract = "Structured entities are commonly abstracted, such as from XML, RDF or hidden-web databases. Direct retrieval of various structured entities is highly demanded in data lakes, e.g., given a JSON object, to find the XML entities that denote the same real-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sirin:2020:MAA, author = "Utku Sirin and Anastasia Ailamaki", title = "Micro-architectural analysis of {OLAP}: limitations and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "840--853", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380755", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380755", abstract = "Understanding micro-architectural behavior is important for efficiently using hardware resources. 
Recent work has shown that in-memory online transaction processing (OLTP) systems severely underutilize their core micro-architecture resources [29]. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2020:EEC, author = "Yixiang Fang and Yixing Yang and Wenjie Zhang and Xuemin Lin and Xin Cao", title = "Effective and efficient community search over large heterogeneous information networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "854--867", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380756", abstract = "Recently, the topic of community search (CS) has gained plenty of attention. Given a query vertex, CS looks for a dense subgraph that contains it. Existing studies mainly focus on homogeneous graphs in which vertices are of the same type, and cannot be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gupta:2020:RGS, author = "Suyash Gupta and Sajjad Rahnama and Jelle Hellings and Mohammad Sadoghi", title = "{ResilientDB}: global scale resilient blockchain fabric", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "868--883", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380757", abstract = "Recent developments in blockchain technology have inspired innovative new designs in resilient distributed and database systems. At their core, these blockchain applications typically use Byzantine fault-tolerant consensus protocols to maintain a common \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Funke:2020:DPQ, author = "Henning Funke and Jens Teubner", title = "Data-parallel query processing on non-uniform data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "884--897", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380758", abstract = "Graphics processing units (GPUs) promise spectacular performance advantages when used as database coprocessors. Their massive compute capacity, however, is often hampered by control flow divergence caused by non-uniform data distributions. When data-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Feng:2020:EMH, author = "Zonghao Feng and Qiong Luo", title = "Evaluating memory-hard proof-of-work algorithms on three processors", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "898--911", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380759", abstract = "Most public blockchain systems, exemplified by cryptocurrencies such as Ethereum and Monero, use memory-hard proof-of-work (PoW) algorithms in consensus protocols to maintain fair participation without a trusted third party. The memory hardness, or the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2020:ASW, author = "Seokki Lee and Bertram Lud{\"a}scher and Boris Glavic", title = "Approximate summaries for why and why-not provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "912--924", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380760", abstract = "Why and why-not provenance have been studied extensively in recent years. However, why-not provenance and --- to a lesser degree --- why provenance can be very large, resulting in severe scalability and usability challenges. We introduce a novel \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2020:PAD, author = "Hao Jiang and Chunwei Liu and Qi Jin and John Paparrizos and Aaron J. Elmore", title = "{PIDS}: attribute decomposition for improved compression and query performance in columnar storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "925--938", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380761", abstract = "We propose PIDS, Pattern Inference Decomposed Storage, an innovative storage method for decomposing string attributes in columnar stores. Using an unsupervised approach, PIDS identifies common patterns in string attributes from relational databases, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Asudeh:2020:DCP, author = "Abolfazl Asudeh and H. V. Jagadish and You (Will) Wu and Cong Yu", title = "On detecting cherry-picked trendlines", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "6", pages = "939--952", month = feb, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3380750.3380762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Apr 2 10:51:28 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380762", abstract = "Poorly supported stories can be told based on data by cherry-picking the data points included. While such stories may be technically accurate, they are misleading. 
In this paper, we build a system for detecting cherry-picking, with a focus on trendlines \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ota:2020:DDD, author = "Masayo Ota and Heiko M{\"u}ller and Juliana Freire and Divesh Srivastava", title = "Data-driven domain discovery for structured datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "953--965", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384346", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384346", abstract = "The growing number of open datasets has created new opportunities to derive insights and address important societal problems. These data, however, often come with little or no metadata, in particular about the types of their attributes, thus greatly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shi:2020:RIF, author = "Jieming Shi and Tianyuan Jin and Renchi Yang and Xiaokui Xiao and Yin Yang", title = "Realtime index-free single source {SimRank} processing on web-scale graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "966--978", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384347", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384347", abstract = "Given a graph $G$ and a node $ u \in G$, a single source SimRank query evaluates the similarity between $u$ and every node $ v \in G$. Existing approaches to single source SimRank computation incur either long query response time, or expensive pre-computation, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:DAR, author = "Jiachuan Wang and Peng Cheng and Libin Zheng and Chao Feng and Lei Chen and Xuemin Lin and Zheng Wang", title = "Demand-aware route planning for shared mobility services", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "979--991", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384348", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384348", abstract = "The dramatic development of shared mobility in food delivery, ridesharing, and crowdsourced parcel delivery has drawn great concerns. 
Specifically, shared mobility refers to transferring or delivering more than one passenger/package together when their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hilprecht:2020:DLD, author = "Benjamin Hilprecht and Andreas Schmidt and Moritz Kulessa and Alejandro Molina and Kristian Kersting and Carsten Binnig", title = "{DeepDB}: learn from data, not from queries!", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "992--1005", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384349", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384349", abstract = "The typical approach for learned DBMS components is to capture the behavior by running a representative set of queries and use the observations to train a machine learning model. This workload-driven approach, however, has two major downsides. First, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:DMU, author = "Yuepeng Wang and Rushi Shah and Abby Criswell and Rong Pan and Isil Dillig", title = "Data migration using datalog program synthesis", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1006--1019", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384350", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384350", abstract = "This paper presents a new technique for migrating data between different schemas. 
Our method expresses the schema mapping as a Datalog program and automatically synthesizes a Datalog program from simple input-output examples to perform data migration. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2020:LTG, author = "Xiaowei Zhu and Guanyu Feng and Marco Serafini and Xiaosong Ma and Jiping Yu and Lei Xie and Ashraf Aboulnaga and Wenguang Chen", title = "{LiveGraph}: a transactional graph storage system with purely sequential adjacency list scans", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1020--1034", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384351", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384351", abstract = "The specific characteristics of graph workloads make it hard to design a one-size-fits-all graph storage system. Systems that support transactional updates use data structures with poor data locality, which limits the efficiency of analytical workloads \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2020:KKB, author = "Xueling Lin and Haoyang Li and Hao Xin and Zijian Li and Lei Chen", title = "{KBPearl}: a knowledge base population system supported by joint entity and relation linking", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1035--1049", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384352", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384352", abstract = "Nowadays, most openly available knowledge bases (KBs) are incomplete, since they are not synchronized with the emerging facts happening in the real world. Therefore, knowledge base population (KBP) from external data sources, which extracts knowledge \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:CUT, author = "Tianyi Li and Ruikai Huang and Lu Chen and Christian S. Jensen and Torben Bach Pedersen", title = "Compression of uncertain trajectories in road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1050--1063", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384353", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384353", abstract = "Massive volumes of uncertain trajectory data are being generated by GPS devices. Due to the limitations of GPS data, these trajectories are generally uncertain. 
This state of affairs renders it is attractive to be able to compress uncertain trajectories \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shastri:2020:UBI, author = "Supreeth Shastri and Vinay Banakar and Melissa Wasserman and Arun Kumar and Vijay Chidambaram", title = "Understanding and benchmarking the impact of {GDPR} on database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1064--1077", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384354", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384354", abstract = "The General Data Protection Regulation (GDPR) provides new rights and protections to European people concerning their personal data. We analyze GDPR from a systems perspective, translating its legal articles into a set of capabilities and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2020:LOP, author = "Jihang Liu and Shimin Chen and Lujun Wang", title = "{LB+Trees}: optimizing persistent index performance on {$3$DXPoint} memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1078--1090", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384355", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384355", abstract = "3DXPoint memory is the first commercially available NVM solution targeting mainstream computer systems. 
While 3DXPoint conforms to many assumptions about NVM in previous studies, we observe a number of distinctive features of 3DXPoint. For example, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lersch:2020:ELT, author = "Lucas Lersch and Ivan Schreter and Ismail Oukid and Wolfgang Lehner", title = "Enabling low tail latency on multicore key-value stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1091--1104", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384356", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384356", abstract = "Modern applications employ key-value stores (KVS) in at least some point of their software stack, often as a caching system or a storage manager. Many of these applications also require a high degree of responsiveness and performance predictability. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2020:PAA, author = "Chunbin Lin and Etienne Boursier and Yannis Papakonstantinou", title = "{Plato}: approximate analytics over compressed time series with tight deterministic error guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1105--1118", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384357", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384357", abstract = "Plato provides fast approximate analytics on time series, by precomputing and storing compressed time series. Plato's key novelty is the delivery of tight deterministic error guarantees for the linear algebra operators over vectors\slash time series, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gera:2020:TLG, author = "Prasun Gera and Hyojong Kim and Piyush Sao and Hyesoon Kim and David Bader", title = "Traversing large graphs on {GPUs} with unified memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1119--1133", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384358", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384358", abstract = "Due to the limited capacity of GPU memory, the majority of prior work on graph applications on GPUs has been restricted to graphs of modest sizes that fit in memory. 
Recent hardware and software advances make it possible to address much larger host \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ping:2020:SHQ, author = "Haoyue Ping and Julia Stoyanovich and Benny Kimelfeld", title = "Supporting hard queries over probabilistic preferences", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "7", pages = "1134--1146", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3384345.3384359", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384359", abstract = "Preference analysis is widely applied in various domains such as social choice and e-commerce. A recently proposed framework augments the relational database with a preference relation that represents uncertain preferences in the form of statistical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2020:DSH, author = "Baotong Lu and Xiangpeng Hao and Tianzheng Wang and Eric Lo", title = "{Dash}: scalable hashing on persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1147--1161", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389134", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389134", abstract = "Byte-addressable persistent memory (PM) brings hash tables the potential of low latency, cheap persistence and instant recovery. 
The recent advent of Intel Optane DC Persistent Memory Modules (DCPMM) further accelerates this trend. Many new hash table designs have been proposed, but most of them were based on emulation and perform sub-optimally on real PM. They were also piece-wise and partial solutions that side-step many important properties, in particular good scalability, high load factor and instant recovery.\par We present Dash, a holistic approach to building dynamic and scalable hash tables on real PM hardware with all the aforementioned properties. Based on Dash, we adapted two popular dynamic hashing schemes (extendible hashing and linear hashing). On a 24-core machine with Intel Optane DCPMM, we show that compared to state-of-the-art, Dash-enabled hash tables can achieve up to $ \approx 3.9 \times $ higher performance with up to over 90\% load factor and an instant recovery time of 57ms regardless of data size.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ferragina:2020:PIF, author = "Paolo Ferragina and Giorgio Vinciguerra", title = "The {PGM-index}: a fully-dynamic compressed learned index with provable worst-case bounds", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1162--1175", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389135", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389135", abstract = "We present the first learned index that supports predecessor, range queries and updates within provably efficient time and space bounds in the worst case. In the (static) context of just predecessor and range queries these bounds turn out to be optimal. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2020:DRC, author = "Minghua Ma and Zheng Yin and Shenglin Zhang and Sheng Wang and Christopher Zheng and Xinhao Jiang and Hanwen Hu and Cheng Luo and Yilin Li and Nengjun Qiu and Feifei Li and Changcheng Chen and Dan Pei", title = "Diagnosing root causes of intermittent slow queries in cloud databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1176--1189", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389136", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389136", abstract = "With the growing market of cloud databases, careful detection and elimination of slow queries are of great importance to service stability. Previous studies focus on optimizing the slow queries that result from internal reasons (e.g., poorly-written \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2020:PEF, author = "Xuhao Chen and Roshan Dathathri and Gurbinder Gill and Keshav Pingali", title = "{Pangolin}: an efficient and flexible graph mining system on {CPU} and {GPU}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1190--1205", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389137", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389137", abstract = "There is growing interest in graph pattern mining (GPM) problems such as motif counting. 
GPM systems have been developed to provide unified interfaces for programming algorithms for these problems and for running them on parallel systems. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dreseler:2020:QTH, author = "Markus Dreseler and Martin Boissier and Tilmann Rabl and Matthias Uflacker", title = "Quantifying {TPC-H} choke points and their optimizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1206--1220", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389138", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389138", abstract = "TPC-H continues to be the most widely used benchmark for relational OLAP systems. It poses a number of challenges, also known as ``choke points'', which database systems have to solve in order to achieve good benchmark results. Examples include joins \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:EAC, author = "Yuanbing Li and Xian Wu and Yifei Jin and Jian Li and Guoliang Li", title = "Efficient algorithms for crowd-aided categorization", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1221--1233", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389139", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389139", abstract = "We study the problem of utilizing human intelligence to categorize a large number of objects. 
In this problem, given a category hierarchy and a set of objects, we can ask humans to check whether an object belongs to a category, and our goal is to find \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:SVD, author = "Shaowei Wang and Yuqiu Qian and Jiachun Du and Wei Yang and Liusheng Huang and Hongli Xu", title = "Set-valued data publication with local privacy: tight error bounds and efficient mechanisms", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1234--1247", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389140", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389140", abstract = "Most user-generated data in online services are presented as set-valued data, e.g., visited website URLs, recently used Apps by a person, and etc. These data are of great value to service providers, but also bring privacy concerns if collected and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fegaras:2020:TAB, author = "Leonidas Fegaras and Hasanuzzaman Noor", title = "Translation of array-based loops to distributed data-parallel programs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1248--1260", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389141", abstract = "Large volumes of data generated by scientific experiments and simulations come in the form of arrays, while programs that analyze these data are frequently expressed in terms of array operations in an imperative, loop-based language. But, as datasets \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2020:IGP, author = "Wenfei Fan and Muyang Liu and Chao Tian and Ruiqi Xu and Jingren Zhou", title = "Incrementalization of graph partitioning algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1261--1274", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389142", abstract = "This paper studies incremental graph partitioning. Given a (vertex-cut or edge-cut) partition $ C(G) $ of a graph $G$ and updates $ \Delta G$ to $G$, it is to compute changes $ \Delta O$ to $ C(G)$, yielding a partition of the updated graph such that (a) the new partition is load-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ko:2020:OIS, author = "Shao-Heng Ko and Hsu-Chao Lai and Hong-Han Shuai and Wang-Chien Lee and Philip S. Yu and De-Nian Yang", title = "Optimizing item and subgroup configurations for social-aware {VR} shopping", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1275--1289", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389143", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389143", abstract = "Shopping in VR malls has been regarded as a paradigm shift for E-commerce, but most of the conventional VR shopping platforms are designed for a single user. In this paper, we envisage a scenario of VR group shopping, which brings major advantages over \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Savvides:2020:ECP, author = "Savvas Savvides and Darshika Khandelwal and Patrick Eugster", title = "Efficient confidentiality-preserving data analytics over symmetrically encrypted datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1290--1303", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389144", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389144", abstract = "In the past decade, cloud computing has emerged as an economical and practical alternative to in-house datacenters. 
But due to security concerns, many enterprises are still averse to adopting third party clouds. To mitigate these concerns, several \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gill:2020:SMG, author = "Gurbinder Gill and Roshan Dathathri and Loc Hoang and Ramesh Peri and Keshav Pingali", title = "Single machine graph analytics on massive datasets using {Intel Optane DC Persistent Memory}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "8", pages = "1304--1318", month = apr, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3389133.3389145", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 5 14:01:14 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389145", abstract = "Intel Optane DC Persistent Memory (Optane PMM) is a new kind of byte-addressable memory with higher density and lower cost than DRAM. This enables the design of affordable systems that support up to 6TB of randomly accessible memory. In this paper, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zakhary:2020:ACA, author = "Victor Zakhary and Divyakant Agrawal and Amr {El Abbadi}", title = "Atomic commitment across blockchains", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1319--1331", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397231", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397231", abstract = "The recent adoption of blockchain technologies and open permissionless networks suggest the importance of peer-to-peer atomic cross-chain transaction protocols. Users should be able to atomically exchange tokens and assets without depending on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mathew:2020:HSM, author = "Ajit Mathew and Changwoo Min", title = "{HydraList}: a scalable in-memory index using asynchronous updates and partial replication", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1332--1345", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397232", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397232", abstract = "Increased capacity of main memory has led to the rise of in-memory databases. With disk access eliminated, efficiency of index structures has become critical for performance in these systems. An ideal index structure should exhibit high performance for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Davis:2020:EMP, author = "A. Jesse Jiryu Davis and Max Hirschhorn and Judah Schvimer", title = "Extreme modelling in practice", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1346--1358", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397233", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397233", abstract = "Formal modelling is a powerful tool for developing complex systems. At MongoDB, we use TLA$^+$ to model and verify multiple aspects of several systems. Ensuring conformance between a specification and its implementation can add value to any specification; \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lyu:2020:MBS, author = "Bingqing Lyu and Lu Qin and Xuemin Lin and Ying Zhang and Zhengping Qian and Jingren Zhou", title = "Maximum biclique search at billion scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1359--1372", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397234", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397234", abstract = "Maximum biclique search, which finds the biclique with the maximum number of edges in a bipartite graph, is a fundamental problem with a wide spectrum of applications in different domains, such as E-Commerce, social analysis, web services, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chepurko:2020:AAR, author = "Nadiia Chepurko and Ryan Marcus and Emanuel Zgraggen and Raul Castro Fernandez and Tim Kraska and David Karger", title = "{ARDA}: automatic relational data augmentation for machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1373--1387", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397235", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397235", abstract = "Automatic machine learning (AML) is a family of techniques to automate the process of training predictive models, aiming to both improve performance and make machine learning more accessible. While many recent works have focused on aspects of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alkowaileet:2020:LBT, author = "Wail Y. Alkowaileet and Sattam Alsubaiee and Michael J. Carey", title = "An {LSM}-based tuple compaction framework for {Apache AsterixDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1388--1400", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397236", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397236", abstract = "Document database systems store self-describing semi-structured records, such as JSON, ``as-is'' without requiring the users to pre-define a schema. 
This provides users with the flexibility to change the structure of incoming records without worrying \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shraga:2020:ACD, author = "Roee Shraga and Avigdor Gal and Haggai Roitman", title = "{ADnEV}: cross-domain schema matching using deep similarity matrix adjustment and evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1401--1415", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397237", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397237", abstract = "Schema matching is a process that serves in integrating structured and semi-structured data. Being a handy tool in multiple contemporary business and commerce applications, it has been investigated in the fields of databases, AI, Semantic Web, and data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2020:QPP, author = "Xuanhe Zhou and Ji Sun and Guoliang Li and Jianhua Feng", title = "Query performance prediction for concurrent queries using graph embedding", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1416--1428", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397238", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397238", abstract = "Query performance prediction is vital to many database tasks (e.g., database monitoring and query scheduling). 
Existing methods focus on predicting the performance for a single query but cannot effectively predict the performance for concurrent queries, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alquraan:2020:SNZ, author = "Ahmed Alquraan and Alex Kogan and Virendra J. Marathe and Samer Al-Kiswany", title = "Scalable, near-zero loss disaster recovery for distributed data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1429--1442", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397239", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397239", abstract = "This paper presents a new Disaster Recovery (DR) system, called Slogger, that differs from prior works in two principle ways: (i) Slogger enables DR for a linearizable distributed data store, and (ii) Slogger adopts the continuous backup approach that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2020:VAN, author = "Kejing Lu and Hongya Wang and Wei Wang and Mineichi Kudo", title = "{VHP}: approximate nearest neighbor search via virtual hypersphere partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1443--1455", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397240", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397240", abstract = "Locality sensitive hashing (LSH) is a widely practiced c -approximate nearest neighbor( c -ANN) search algorithm in high dimensional spaces. The state-of-the-art LSH based algorithm searches an unbounded and irregular space to identify candidates, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2020:IFS, author = "Hyunjoon Kim and Seunghwan Min and Kunsoo Park and Xuemin Lin and Seok-Hee Hong and Wook-Shin Han", title = "{IDAR}: fast supergraph search using {DAG} integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1456--1468", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397241", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397241", abstract = "Supergraph search is one of fundamental graph query processing problems in many application domains. Given a query graph and a set of data graphs, supergraph search is to find all the data graphs contained in the query graph as subgraphs. 
In existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Seleznova:2020:GEU, author = "Mariia Seleznova and Behrooz Omidvar-Tehrani and Sihem Amer-Yahia and Eric Simon", title = "Guided exploration of user groups", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1469--1482", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397242", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397242", abstract = "Finding a set of users of interest serves several applications in behavioral analytics. Often times, identifying users requires to explore the data and gradually choose potential targets. This is a special case of Exploratory Data Analysis (EDA), an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2020:IID, author = "Long Gong and Huayi Wang and Mitsunori Ogihara and Jun Xu", title = "{iDEC}: indexable distance estimating codes for approximate nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1483--1497", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397243", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397243", abstract = "Approximate Nearest Neighbor (ANN) search is a fundamental algorithmic problem, with numerous applications in many areas of computer science. 
In this work, we propose indexable distance estimating codes (iDEC), a new solution framework to ANN that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bian:2020:EAB, author = "Song Bian and Qintian Guo and Sibo Wang and Jeffrey Xu Yu", title = "Efficient algorithms for budgeted influence maximization on massive social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1498--1510", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397244", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397244", abstract = "Given a social network G, a cost associated with each node, and a budget B, the budgeted influence maximization (BIM) problem aims to find a set S of nodes, denoted as the seed set, that maximizes the expected number of influenced users under the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Prateek:2020:MTK, author = "Arneish Prateek and Arijit Khan and Akshit Goyal and Sayan Ranu", title = "Mining Top-$k$ pairs of correlated subgraphs in a large network", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1511--1524", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397245", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397245", abstract = "We investigate the problem of correlated subgraphs mining (CSM) where the goal is to identify pairs of subgraph patterns that frequently co-occur in proximity within a single graph. Correlated subgraph patterns are different from frequent subgraphs due \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Buchnik:2020:FHT, author = "Yehonatan Buchnik and Roy Friedman", title = "{FireLedger}: a high throughput blockchain consensus protocol", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1525--1539", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397246", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397246", abstract = "Blockchains are distributed secure ledgers to which transactions are issued continuously and each block of transactions is tightly coupled to its predecessors. Permissioned blockchains place special emphasis on transactions throughput. 
In this paper we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:PEF, author = "Kefei Wang and Jian Liu and Feng Chen", title = "Put an elephant into a fridge: optimizing cache efficiency for in-memory key--value stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1540--1554", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397247", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397247", abstract = "In today's data centers, memory-based key-value systems, such as Memcached and Redis, play an indispensable role in providing high-speed data services. The rapidly growing capacity and quickly falling price of DRAM memory in the past years have enabled \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pedersen:2020:ASR, author = "Simon Aagaard Pedersen and Bin Yang and Christian S. Jensen", title = "Anytime stochastic routing with hybrid learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1555--1567", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397248", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397248", abstract = "Increasingly massive volumes of vehicle trajectory data hold the potential to enable higher-resolution traffic services than hitherto possible. 
We use trajectory data to create a high-resolution, uncertain road-network graph, where edges are associated \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2020:UED, author = "Qizhen Zhang and Yifan Cai and Xinyi Chen and Sebastian Angel and Ang Chen and Vincent Liu and Boon Thau Loo", title = "Understanding the effect of data center resource disaggregation on production {DBMSs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1568--1581", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397249", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397249", abstract = "Resource disaggregation is a new architecture for data centers in which resources like memory and storage are decoupled from the CPU, managed independently, and connected through a high-speed network. Recent work has shown that although disaggregated \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tziavelis:2020:OAR, author = "Nikolaos Tziavelis and Deepak Ajwani and Wolfgang Gatterbauer and Mirek Riedewald and Xiaofeng Yang", title = "Optimal algorithms for ranked enumeration of answers to full conjunctive queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1582--1597", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397250", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397250", abstract = "We study ranked enumeration of join-query results according to very general orders defined by selective dioids. Our main contribution is a framework for ranked enumeration over a class of dynamic programming problems that generalizes seemingly different \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dhulipala:2020:SPS, author = "Laxman Dhulipala and Charles McGuffey and Hongbo Kang and Yan Gu and Guy E. Blelloch and Phillip B. Gibbons and Julian Shun", title = "{Sage}: parallel semi-asymmetric graph algorithms for {NVRAMs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "9", pages = "1598--1613", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3397230.3397251", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Jul 8 18:23:01 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397251", abstract = "Non-volatile main memory (NVRAM) technologies provide an attractive set of features for large-scale graph analytics, including byte-addressability, low idle power, and improved memory-density. 
NVRAM systems today have an order of magnitude more NVRAM \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2020:PIN, author = "Yuqing Zhu and Jing Tang and Xueyan Tang", title = "Pricing influential nodes in online social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1614--1627", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401961", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401961", abstract = "Influential nodes with rich connections in online social networks (OSNs) are of great values to initiate marketing campaigns. However, the potential influence spread that can be generated by these influential nodes is hidden behind the structures of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2020:KSA, author = "Bintao Sun and Maximilien Danisch and T-H. Hubert Chan and Mauro Sozio", title = "{KClist++}: a simple algorithm for finding $k$-clique densest subgraphs in large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1628--1640", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401962", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401962", abstract = "The problem of finding densest subgraphs has received increasing attention in recent years finding applications in biology, finance, as well as social network analysis. 
The k -clique densest subgraph problem is a generalization of the densest subgraph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wellenzohn:2020:DIC, author = "Kevin Wellenzohn and Michael H. B{\"o}hlen and Sven Helmer", title = "Dynamic interleaving of content and structure for robust indexing of semi-structured hierarchical data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1641--1653", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401963", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401963", abstract = "We propose a robust index for semi-structured hierarchical data that supports content-and-structure (CAS) queries specified by path and value predicates. At the heart of our approach is a novel dynamic interleaving scheme that merges the path and value \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agarwal:2020:CGS, author = "Shubhangi Agarwal and Sourav Dutta and Arnab Bhattacharya", title = "{ChiSeL}: graph similarity search using chi-squared statistics in large probabilistic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1654--1668", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401964", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401964", abstract = "Subgraph querying is one of the most important primitives in many applications. 
Although the field is well studied for deterministic graphs, in many situations, the graphs are probabilistic in nature. In this paper, we address the problem of subgraph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tan:2020:FID, author = "Zijing Tan and Ai Ran and Shuai Ma and Sheng Qin", title = "Fast incremental discovery of pointwise order dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1669--1681", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401965", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401965", abstract = "Pointwise order dependencies (PODs) are dependencies that specify ordering semantics on attributes of tuples. POD discovery refers to the process of identifying the set $ \Sigma $ of valid and minimal PODs on a given data set D. In practice D is typically large \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Livshits:2020:ADC, author = "Ester Livshits and Alireza Heidari and Ihab F. 
Ilyas and Benny Kimelfeld", title = "Approximate denial constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1682--1695", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401966", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401966", abstract = "The problem of mining integrity constraints from data has been extensively studied over the past two decades for commonly used types of constraints, including the classic Functional Dependencies (FDs) and the more general Denial Constraints (DCs). In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rehrmann:2020:SOO, author = "Robin Rehrmann and Carsten Binnig and Alexander B{\"o}hm and Kihong Kim and Wolfgang Lehner", title = "Sharing opportunities for {OLTP} workloads in different isolation levels", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1696--1708", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401967", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401967", abstract = "OLTP applications are usually executed by a high number of clients in parallel and are typically faced with high throughput demand as well as a constraint latency requirement for individual statements. Interestingly, OLTP workloads are often read-heavy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Neumann:2020:BBM, author = "Stefan Neumann and Pauli Miettinen", title = "Biclustering and {Boolean} matrix factorization in data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1709--1722", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401968", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401968", abstract = "We study clustering of bipartite graphs and Boolean matrix factorization in data streams. We consider a streaming setting in which the vertices from the left side of the graph arrive one by one together with all of their incident edges. We provide an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jian:2020:EER, author = "Xun Jian and Yue Wang and Lei Chen", title = "Effective and efficient relational community detection and search in large dynamic heterogeneous information networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1723--1736", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401969", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401969", abstract = "Community search in heterogeneous information networks (HINs) has attracted much attention in graph analysis. Given a vertex, the goal is to find a densely-connected sub-graph that contains the vertex. In practice, the user may need to restrict the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kim:2020:NLS,
  author =       "Hyeonji Kim and Byeong-Hoon So and Wook-Shin Han and Hongrae Lee",
  title =        "Natural language to {SQL}: where are we today?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "10",
  pages =        "1737--1750",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3401960.3401970",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 24 11:36:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3401960.3401970",
  abstract =     "Translating natural language to SQL (NL2SQL) has received extensive attention lately, especially with the recent success of deep learning technologies. However, despite the large number of studies, we do not have a thorough understanding of how good \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Che:2020:ATD,
  author =       "Yulin Che and Zhuohang Lai and Shixuan Sun and Yue Wang and Qiong Luo",
  title =        "Accelerating truss decomposition on heterogeneous processors",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "10",
  pages =        "1751--1764",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3401960.3401971",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 24 11:36:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3401960.3401971",
  abstract =     "Truss decomposition is to divide a graph into a hierarchy of subgraphs, or trusses. A subgraph is a $k$-truss ($ k \geq 2 $) if each edge is in at least $ k - 2 $ triangles in the subgraph. Existing algorithms work by first counting the number of triangles each \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mukherjee:2020:SDS, author = "Rohan Mukherjee and Swarat Chaudhuri and Chris Jermaine", title = "Searching a database of source codes using contextualized code search", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1765--1778", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401972", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401972", abstract = "Consider the case where a programmer has written some part of a program, but has left part of the program (such as a method or a function body) incomplete. The goal is to use the context surrounding the missing code to automatically ``figure out'' which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:DSE, author = "Yan Li and Tingjian Ge and Cindy Chen", title = "Data stream event prediction based on timing knowledge and state transitions", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1779--1792", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401973", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401973", abstract = "We study a practical problem of predicting the upcoming events in data streams using a novel approach. Treating event time orders as relationship types between event entities, we build a dynamic knowledge graph and use it to predict future event timing. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{McSherry:2020:SAP, author = "Frank McSherry and Andrea Lattuada and Malte Schwarzkopf and Timothy Roscoe", title = "Shared arrangements: practical inter-query sharing for streaming dataflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "10", pages = "1793--1806", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3401960.3401974", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:36:56 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3401960.3401974", abstract = "Current systems for data-parallel, incremental processing and view maintenance over high-rate streams isolate the execution of independent queries. This creates unwanted redundancy and overhead in the presence of concurrent incrementally maintained \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } %%% [29-Oct-2022] TO DO: v13n11 is not yet published @Article{Gupta:2020:SBD, author = "Peeyush Gupta and Michael J. Carey and Sharad Mehrotra and Roberto Yus", title = "{SmartBench}: a benchmark for data management in smart spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1807--1820", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407791", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407791", abstract = "This paper proposes SmartBench, a benchmark focusing on queries resulting from (near) real-time applications and longer-term analysis of IoT data.
SmartBench, derived from a deployed smart building monitoring system, is comprised of: (1) An extensible \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boniol:2020:SGB, author = "Paul Boniol and Themis Palpanas", title = "{Series2Graph}: graph-based subsequence anomaly detection for time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1821--1834", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407792", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407792", abstract = "Subsequence anomaly detection in long sequences is an important problem with applications in a wide range of domains. However, the approaches that have been proposed so far in the literature have severe limitations: they either require prior domain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2020:SCS, author = "Dan Zhang and Madelon Hulsebos and Yoshihiko Suhara and {\c{C}}a{\u{g}}atay Demiralp and Jinfeng Li and Wang-Chiew Tan", title = "{Sato}: contextual semantic type detection in tables", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1835--1848", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407793", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407793", abstract = "Detecting the semantic types of data columns in relational tables is important for various data preparation and information retrieval tasks such as data cleaning, schema matching, data discovery, and semantic search. However, existing detection \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2020:TTP, author = "Qijian He and Wei Yang and Bingren Chen and Yangyang Geng and Liusheng Huang", title = "{TransNet}: training privacy-preserving neural network over transformed layer", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1849--1862", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407794", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407794", abstract = "The accuracy of neural network can be improved by training over multi-participants' pooled dataset, but privacy problem of sharing sensitive data obstructs this collaborative learning.
To solve this contradiction, we propose TransNet, a novel solution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2020:CAG, author = "Wenfei Fan and Ruochun Jin and Muyang Liu and Ping Lu and Chao Tian and Jingren Zhou", title = "Capturing associations in graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1863--1876", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407795", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407795", abstract = "This paper proposes a class of graph association rules, denoted by GARs, to specify regularities between entities in graphs. A GAR is a combination of a graph pattern and a dependency; it may take as predicates ML (machine learning) classifiers for link \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Renz-Wieland:2020:DPA, author = "Alexander Renz-Wieland and Rainer Gemulla and Steffen Zeuch and Volker Markl", title = "Dynamic parameter allocation in parameter servers", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1877--1890", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407796", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407796", abstract = "To keep up with increasing dataset sizes and model complexity, distributed training has become a necessity for large machine learning tasks. 
Parameter servers ease the implementation of distributed parameter management---a key concern in distributed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Freitag:2020:AWC, author = "Michael Freitag and Maximilian Bandle and Tobias Schmidt and Alfons Kemper and Thomas Neumann", title = "Adopting worst-case optimal joins in relational database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1891--1904", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407797", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407797", abstract = "Worst-case optimal join algorithms are attractive from a theoretical point of view, as they offer asymptotically better runtime than binary joins on certain types of queries. In particular, they avoid enumerating large intermediate results by processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{McKenna:2020:WAM, author = "Ryan McKenna and Raj Kumar Maity and Arya Mazumdar and Gerome Miklau", title = "A workload-adaptive mechanism for linear queries under local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1905--1918", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407798", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407798", abstract = "We propose a new mechanism to accurately answer a user-provided set of linear counting queries under local differential privacy (LDP). Given a set of linear counting queries (the workload) our mechanism automatically adapts to provide accuracy on the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:SSP, author = "Yisu Remy Wang and Shana Hutchison and Jonathan Leang and Bill Howe and Dan Suciu", title = "{SPORES}: sum-product optimization via relational equality saturation for large scale linear algebra", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1919--1932", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407799", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407799", abstract = "Machine learning algorithms are commonly specified in linear algebra (LA). 
LA expressions can be rewritten into more efficient forms, by taking advantage of input properties such as sparsity, as well as program properties such as common subexpressions. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fernandez:2020:DMP, author = "Raul Castro Fernandez and Pranav Subramaniam and Michael J. Franklin", title = "Data market platforms: trading data assets to solve data problems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1933--1947", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407800", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407800", abstract = "Data only generates value for a few organizations with expertise and resources to make data shareable, discoverable, and easy to integrate. Sharing data that is easy to discover and integrate is hard because data owners lack information (who needs what \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mahdavi:2020:BEE, author = "Mohammad Mahdavi and Ziawasch Abedjan", title = "{Baran}: effective error correction via a unified context representation and transfer learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1948--1961", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407801", abstract = "Traditional error correction solutions leverage handmaid rules or master data to find the correct values. Both are often amiss in real-world scenarios. Therefore, it is desirable to additionally learn corrections from a limited number of example \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2020:RDS, author = "Ju Fan and Junyou Chen and Tongyu Liu and Yuwei Shen and Guoliang Li and Xiaoyong Du", title = "Relational data synthesis using generative adversarial networks: a design space exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1962--1975", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407802", abstract = "The proliferation of big data has brought an urgent demand for privacy-preserving data publishing. Traditional solutions to this demand have limitations on effectively balancing the tradeoff between privacy and utility of the released data. 
Thus, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2020:LLP, author = "Lei Yang and Hong Wu and Tieying Zhang and Xuntao Cheng and Feifei Li and Lei Zou and Yujie Wang and Rongyao Chen and Jianying Wang and Gui Huang", title = "{Leaper}: a learned prefetcher for cache invalidation in {LSM}-tree based storage engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1976--1989", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407803", abstract = "Frequency-based cache replacement policies that work well on page-based database storage engines are no longer sufficient for the emerging LSM-tree (Log-Structure Merge-tree) based storage engines. Due to the append-only and copy-on-write techniques \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2020:ASG, author = "Daniel Kang and Edward Gan and Peter Bailis and Tatsunori Hashimoto and Matei Zaharia", title = "Approximate selection with guarantees using proxies", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "1990--2003", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407804", abstract = "Due to the falling costs of data acquisition and storage, researchers and industry analysts often want to find all instances of rare events in large datasets. For instance, scientists can cheaply capture thousands of hours of video, but are limited by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2020:EIC, author = "Minji Kang and Soyee Choi and Gihwan Oh and Sang-Won Lee", title = "{2R}: efficiently isolating cold pages in flash storages", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2004--2017", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407805", abstract = "Given skewed writes common in databases, the conventional 1R-Greedy FTL incurs huge write amplification, most of which is contributed by cold pages amounting to 80\% of data. Since 1R-Greedy manages all flash blocks in one region at no type distinction, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bashardoost:2020:KT, author = "Bahar Ghadiri Bashardoost and Ren{\'e}e J. Miller and Kelly Lyons and Fatemeh Nargesian", title = "Knowledge translation", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2018--2032", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407806", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407806", abstract = "We introduce Kensho, a tool for generating mapping rules between two Knowledge Bases (KBs). To create the mapping rules, Kensho starts with a set of correspondences and enriches them with additional semantic information automatically identified from the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Petersohn:2020:TSD, author = "Devin Petersohn and Stephen Macke and Doris Xin and William Ma and Doris Lee and Xiangxi Mo and Joseph E. Gonzalez and Joseph M. Hellerstein and Anthony D. Joseph and Aditya Parameswaran", title = "Towards scalable dataframe systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2033--2046", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407807", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407807", abstract = "Dataframes are a popular abstraction to represent, prepare, and analyze data. Despite the remarkable success of dataframe libraries in R and Python, dataframes face performance issues even on moderately large datasets. 
Moreover, there is significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2020:AFP, author = "Yi Lu and Xiangyao Yu and Lei Cao and Samuel Madden", title = "{Aria}: a fast and practical deterministic {OLTP} database", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2047--2060", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407808", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407808", abstract = "Deterministic databases are able to efficiently run transactions across different replicas without coordination. However, existing state-of-the-art deterministic databases require that transaction read/write sets are known before execution, making such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2020:COS, author = "Dongjing Miao and Zhipeng Cai and Jianzhong Li and Xiangyu Gao and Xianmin Liu", title = "The computation of optimal subset repairs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2061--2074", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407809", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407809", abstract = "Computing an optimal subset repair of an inconsistent database is becoming a standalone research problem and has a wide range of applications. However, it has not been well-studied yet. 
A tight inapproximability bound of the problem computing optimal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Christodoulakis:2020:PPB, author = "Christina Christodoulakis and Eric B. Munson and Moshe Gabel and Angela Demke Brown and Ren{\'e}e J. Miller", title = "{Pytheas}: pattern-based table discovery in {CSV} files", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2075--2089", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407810", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407810", abstract = "CSV is a popular Open Data format widely used in a variety of domains for its simplicity and effectiveness in storing and disseminating data. Unfortunately, data published in this format often does not conform to strict specifications, making automated \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2020:PPV, author = "Yuncheng Wu and Shaofeng Cai and Xiaokui Xiao and Gang Chen and Beng Chin Ooi", title = "Privacy preserving vertical federated learning for tree-based models", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2090--2103", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407811", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407811", abstract = "Federated learning (FL) is an emerging paradigm that enables multiple organizations to jointly train a model without revealing their private data to each other. This paper studies vertical federated learning, which tackles the scenarios where (i) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Al-Baghdadi:2020:TBC, author = "Ahmed Al-Baghdadi and Xiang Lian", title = "Topic-based community search over spatial-social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2104--2117", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407812", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407812", abstract = "Recently, the community search problem has attracted significant attention, due to its wide spectrum of real-world applications such as event organization, friend recommendation, advertisement in e-commence, and so on. Given a query vertex, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fritz:2020:LME, author = "Manuel Fritz and Michael Behringer and Holger Schwarz", title = "{LOG-Means}: efficiently estimating the number of clusters in large datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2118--2131", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407813", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407813", abstract = "Clustering is a fundamental primitive in manifold applications. In order to achieve valuable results, parameters of the clustering algorithm, e.g., the number of clusters, have to be set appropriately, which is a tremendous pitfall. To this end, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Krastnikov:2020:EOD, author = "Simeon Krastnikov and Florian Kerschbaum and Douglas Stebila", title = "Efficient oblivious database joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2132--2145", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407814", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407814", abstract = "A major algorithmic challenge in designing applications intended for secure remote execution is ensuring that they are oblivious to their inputs, in the sense that their memory access patterns do not leak sensitive information to the server. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Issa:2020:ETQ, author = "Ousmane Issa and Angela Bonifati and Farouk Toumani", title = "Evaluating top-$k$ queries with inconsistency degrees", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2146--2158", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407815", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407815", abstract = "We study the problem of augmenting relational tuples with inconsistency awareness and tackling top-k queries under a set of denial constraints (DCs). We define a notion of inconsistent tuples with respect to a set of DCs and define two measures of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nakandala:2020:CDS, author = "Supun Nakandala and Yuhao Zhang and Arun Kumar", title = "{Cerebro}: a data system for optimized deep learning model selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2159--2173", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407816", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See errata \cite{Nakandala:2021:ECD}.", URL = "https://dl.acm.org/doi/10.14778/3407790.3407816", abstract = "Deep neural networks (deep nets) are revolutionizing many machine learning (ML) applications. But there is a major bottleneck to wider adoption: the pain and resource intensiveness of model selection. This empirical process involves exploring deep net \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gan:2020:COP, author = "Edward Gan and Peter Bailis and Moses Charikar", title = "{CoopStore}: optimizing precomputed summaries for aggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2174--2187", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407817", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407817", abstract = "An emerging class of data systems partition their data and precompute approximate summaries (i.e., sketches and samples) for each segment to reduce query costs. They can then aggregate and combine the segment summaries to estimate results without \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koide:2020:FSS, author = "Satoshi Koide and Chuan Xiao and Yoshiharu Ishikawa", title = "Fast subtrajectory similarity search in road networks under weighted edit distance constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2188--2201", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407818", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407818", abstract = "In this paper, we address a similarity search problem for spatial trajectories in road networks. In particular, we focus on the subtrajectory similarity search problem, which involves finding in a database the subtrajectories similar to a query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2020:SAG, author = "Yu Liu and Lei Zou and Qian Ge and Zhewei Wei", title = "{SimTab}: accuracy-guaranteed {SimRank} queries through tighter confidence bounds and multi-armed bandits", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2202--2214", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407819", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407819", abstract = "SimRank is a classic measure of vertex-pair similarity according to the structure of graphs. Top-$k$ and thresholding SimRank queries are two important types of similarity search with numerous applications in web mining, social network analysis, spam \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dutt:2020:EAS, author = "Anshuman Dutt and Chi Wang and Vivek Narasayya and Surajit Chaudhuri", title = "Efficiently approximating selectivity functions using low overhead regression models", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2215--2228", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407820", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407820", abstract = "Today's query optimizers use fast selectivity estimation techniques but are known to be susceptible to large estimation errors. 
Recent work on supervised learned models for selectivity estimation significantly improves accuracy while ensuring relatively \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2020:IID, author = "Yin Lin and Yifan Guan and Abolfazl Asudeh and H. V. Jagadish", title = "Identifying insufficient data coverage in databases with multiple relations", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2229--2242", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407821", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407821", abstract = "In today's data-driven world, it is critical that we use appropriate datasets for analysis and decision-making. Datasets could be biased because they reflect existing inequalities in the world, due to the data scientists' biased world view, or due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:CMA, author = "Lingxiao Li and Muhammad Aamir Cheema and Mohammed Eunus Ali and Hua Lu and David Taniar", title = "Continuously monitoring alternative shortest paths on road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2243--2255", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407822", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407822", abstract = "Modern navigation systems do not only provide shortest paths but also some alternative paths to provide more options to the users. This paper is the first to study the problem of continuously reporting alternative paths for a user traveling along a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2020:HMC, author = "Geon Lee and Jihoon Ko and Kijung Shin", title = "Hypergraph motifs: concepts, algorithms, and discoveries", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2256--2269", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407823", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407823", abstract = "Hypergraphs naturally represent group interactions, which are omnipresent in many domains: collaborations of researchers, co-purchases of items, joint interactions of proteins, to name a few. In this work, we propose tools for answering the following \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Birnick:2020:HSE, author = "Johann Birnick and Thomas Bl{\"a}sius and Tobias Friedrich and Felix Naumann and Thorsten Papenbrock and Martin Schirneck", title = "Hitting set enumeration with partial information for unique column combination discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2270--2283", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407824", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407824", abstract = "Unique column combinations (UCCs) are a fundamental concept in relational databases. They identify entities in the data and support various data management activities. Still, UCCs are usually not explicitly defined and need to be discovered. State-of-the-art \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2020:SDS, author = "Yue Chen and Zhida Chen and Gao Cong and Ahmed R. Mahmood and Walid G. Aref", title = "{SSTD}: a distributed system on streaming spatio-textual data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2284--2296", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407825", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407825", abstract = "Streaming spatio-textual data that contains geolocations and textual contents, e.g., geo-tagged tweets, is becoming increasingly available. 
Users can register continuous queries to receive up-to-date results continuously, or pose snapshot queries to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohammed:2020:CPI, author = "Haneen Mohammed and Ziyun Wei and Eugene Wu and Ravi Netravali", title = "Continuous prefetch for interactive data applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2297--2311", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407826", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407826", abstract = "Interactive data visualization and exploration (DVE) applications are often network-bottlenecked due to bursty request patterns, large response sizes, and heterogeneous deployments over a range of networks and devices. This makes it difficult to ensure \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:EES, author = "Zheng Wang and Cheng Long and Gao Cong and Yiding Liu", title = "Efficient and effective similar subtrajectory search with deep reinforcement learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2312--2325", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407827", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407827", abstract = "Similar trajectory search is a fundamental problem and has been well studied over the past two decades. 
However, the similar subtrajectory search (SimSub) problem, aiming to return a portion of a trajectory (i.e., a subtrajectory), which is the most \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2020:BSE, author = "Zequn Sun and Qingheng Zhang and Wei Hu and Chengming Wang and Muhao Chen and Farahnaz Akrami and Chengkai Li", title = "A benchmarking study of embedding-based entity alignment for knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2326--2340", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407828", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407828", abstract = "Entity alignment seeks to find entities in different knowledge graphs (KGs) that refer to the same real-world object. Recent advancement in KG embedding impels the advent of embedding-based entity alignment, which encodes entities in a continuous \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qi:2020:ELS, author = "Jianzhong Qi and Guanli Liu and Christian S. 
Jensen and Lars Kulik", title = "Effectively learning spatial indices", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2341--2354", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407829", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407829", abstract = "Machine learning, especially deep learning, is used increasingly to enable better solutions for data management tasks previously solved by other means, including database indexing. A recent study shows that a neural network can not only learn to predict \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2020:SLB, author = "Qiyu Liu and Libin Zheng and Yanyan Shen and Lei Chen", title = "Stable learned {Bloom} filters for data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2355--2367", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407830", abstract = "Bloom filter and its variants are elegant space-efficient probabilistic data structures for approximate set membership queries. It has been recently shown that the space cost of Bloom filters can be significantly reduced via a combination with pre-trained \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jin:2020:ATL, author = "Zhongjun Jin and Yeye He and Surajit Chaudhuri", title = "{Auto-transform}: learning-to-transform by patterns", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2368--2381", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407831", abstract = "Data Transformation is a long-standing problem in data management. Recent work adopts a ``transform-by-example'' (TBE) paradigm to infer transformation programs based on user-provided input/output examples, which greatly improves usability, and brought \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kossmann:2020:MMM, author = "Jan Kossmann and Stefan Halfpap and Marcel Jankrift and Rainer Schlosser", title = "Magic mirror in my hand, which is the best in the land?: an experimental evaluation of index selection algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2382--2395", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407832", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407832", abstract = "Indexes are essential for the efficient processing of database workloads. 
Proposed solutions for the relevant and challenging index selection problem range from metadata-based simple heuristics, over sophisticated multi-step algorithms, to approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Damme:2020:MAQ, author = "Patrick Damme and Annett Ungeth{\"u}m and Johannes Pietrzyk and Alexander Krause and Dirk Habich and Wolfgang Lehner", title = "{MorphStore}: analytical query engine with a holistic compression-enabled processing model", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2396--2410", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407833", abstract = "In this paper, we present MorphStore, an open-source in-memory columnar analytical query engine with a novel holistic compression-enabled processing model. Basically, compression using lightweight integer compression algorithms already plays an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Parchas:2020:FED, author = "Panos Parchas and Yonatan Naamad and Peter {Van Bouwel} and Christos Faloutsos and Michalis Petropoulos", title = "Fast and effective distribution-key recommendation for {Amazon Redshift}", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2411--2423", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407834", abstract = "How should we split data among the nodes of a distributed data warehouse in order to boost performance for a forecasted workload? In this paper, we study the effect of different data partitioning schemes on the overall network cost of pairwise joins. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pappachan:2020:SMA, author = "Primal Pappachan and Roberto Yus and Sharad Mehrotra and Johann-Christoph Freytag", title = "{Sieve}: a middleware approach to scalable access control for database management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2424--2437", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407835", abstract = "Current approaches for enforcing Fine Grained Access Control (FGAC) in DBMS do not scale to scenarios when the number of access control policies are in the order of thousands. 
This paper identifies such a use case in the context of emerging smart spaces \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sreekanti:2020:CSF, author = "Vikram Sreekanti and Chenggang Wu and Xiayue Charles Lin and Johann Schleier-Smith and Joseph E. Gonzalez and Joseph M. Hellerstein and Alexey Tumanov", title = "{Cloudburst}: stateful functions-as-a-service", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2438--2452", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407836", abstract = "Function-as-a-Service (FaaS) platforms and ``serverless'' cloud computing are becoming increasingly popular due to ease-of-use and operational simplicity. Current FaaS offerings are targeted at stateless functions that do minimal I/O and communication. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Suprem:2020:OAD, author = "Abhijit Suprem and Joy Arulraj and Calton Pu and Joao Ferreira", title = "{ODIN}: automated drift detection and recovery in video analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2453--2465", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407837", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407837", abstract = "Recent advances in computer vision have led to a resurgence of interest in visual data analytics. 
Researchers are developing systems for effectively and efficiently analyzing visual data at scale. A significant challenge that these systems encounter \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Piao:2020:MRA, author = "Chengzhi Piao and Weiguo Zheng and Yu Rong and Hong Cheng", title = "Maximizing the reduction ability for near-maximum independent set computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2466--2478", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407838", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407838", abstract = "Finding the maximum independent set is a fundamental NP-hard problem in graph theory. Recent studies have paid much attention to designing efficient algorithms that find a maximal independent set of good quality (the more vertices the better). \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2020:FTA, author = "Zhao Chen and Peng Cheng and Lei Chen and Xuemin Lin and Cyrus Shahabi", title = "Fair task assignment in spatial crowdsourcing", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2479--2492", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407839", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407839", abstract = "With the pervasiveness of mobile devices, wireless broadband and sharing economy, spatial crowdsourcing is becoming part of our daily life. Existing studies on spatial crowdsourcing usually focus on enhancing the platform interests and customer \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2020:DSC, author = "Hao Zhang and Jeffrey Xu Yu and Yikai Zhang and Kangfei Zhao and Hong Cheng", title = "Distributed subgraph counting: a general approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2493--2507", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407840", abstract = "In this paper, we study local subgraph counting, which is to count the occurrences of a user-given pattern graph p around every node v in a data graph G, when v matches to a given orbit o in p, where the orbit serves as a center to count p. In general, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karagiannis:2020:SMI, author = "Georgios Karagiannis and Mohammed Saeed and Paolo Papotti and Immanuel Trummer", title = "{Scrutinizer}: a mixed-initiative approach to large-scale, data-driven claim verification", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2508--2521", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407841", abstract = "Organizations spend significant amounts of time and money to manually fact check text documents summarizing data. The goal of the Scrutinizer system is to reduce verification overheads by supporting human fact checkers in translating text claims into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Krivosheev:2020:DPC, author = "Evgeny Krivosheev and Siarhei Bykau and Fabio Casati and Sunil Prabhakar", title = "Detecting and preventing confused labels in crowdsourced data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2522--2535", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407842", abstract = "Crowdsourcing is a challenging activity for many reasons, from task design to workers' training, identification of low-quality annotators, and many more. 
A particularly subtle form of error is due to confusion of observations, that is, crowd workers \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:OHC, author = "Rong-Hua Li and Sen Gao and Lu Qin and Guoren Wang and Weihua Yang and Jeffrey Xu Yu", title = "Ordering heuristics for $k$-clique listing", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2536--2548", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407843", abstract = "Listing all $k$-cliques in a graph is a fundamental graph mining problem that finds many important applications in community detection and social network analysis. Unfortunately, the problem of $k$-clique listing is often deemed infeasible for a large $k$, as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:DSM, author = "Jinfeng Li and Yuliang Li and Xiaolan Wang and Wang-Chiew Tan", title = "Deep or simple models for semantic tagging?: it depends on your data", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2549--2562", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407844", abstract = "Semantic tagging, which has extensive applications in text mining, predicts whether a given piece of text conveys the meaning of a given semantic tag. 
The problem of semantic tagging is largely solved with supervised learning and today, deep learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bilal:2020:DBC, author = "Muhammad Bilal and Marco Serafini and Marco Canini and Rodrigo Rodrigues", title = "Do the best cloud configurations grow on trees?: an experimental evaluation of black box algorithms for optimizing cloud workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2563--2575", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407845", abstract = "Cloud configuration optimization is the procedure to determine the number and the type of instances to use when deploying an application in cloud environments, given a cost or performance objective. In the absence of a performance model for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2020:FLD, author = "Alexander Zhou and Yue Wang and Lei Chen", title = "Finding large diverse communities on networks: the edge maximum $ k*$-partite clique", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2576--2589", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407846", abstract = "In this work we examine the problem of finding large, diverse communities on graphs where the users are separated into distinct groups. More specifically, this work considers diversity to be the inclusion of users from multiple groups as opposed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{vanderLinde:2020:PCS, author = "Albert van der Linde and Jo{\~a}o Leit{\~a}o and Nuno Pregui{\c{c}}a", title = "Practical client-side replication: weak consistency semantics for insecure settings", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2590--2605", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407847", abstract = "Client-side replication and direct client-to-client synchronization can be used to create highly available, low-latency interactive applications. 
Causal consistency, the strongest available consistency model under network partitions, is an attractive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rong:2020:APS, author = "Kexin Rong and Yao Lu and Peter Bailis and Srikanth Kandula and Philip Levis", title = "Approximate partition selection for big-data workloads using summary statistics", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2606--2619", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407848", abstract = "Many big-data clusters store data in large partitions that support access at a coarse, partition-level granularity. As a result, approximate query processing via row-level sampling is inefficient, often requiring reads of many partitions. In this work, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Winter:2020:MMH, author = "Christian Winter and Tobias Schmidt and Thomas Neumann and Alfons Kemper", title = "Meet me halfway: split maintenance of continuous views", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2620--2633", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407849", abstract = "From Industry 4.0-driven factories to real-time trading algorithms, businesses depend on analytics on high-velocity real-time data. Often these analytics are performed not in dedicated stream processing engines but on views within a general-purpose \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2020:UPB, author = "Youmin Chen and Youyou Lu and Kedong Fang and Qing Wang and Jiwu Shu", title = "{uTree}: a persistent {B+-tree} with low tail latency", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2634--2648", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407850", abstract = "Tail latency is a critical design issue in recent storage systems. B$^+$-tree, as a fundamental building block in storage systems, incurs high tail latency, especially when placed in persistent memory (PM). Our empirical study specifies two factors that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boncz:2020:FFR, author = "Peter Boncz and Thomas Neumann and Viktor Leis", title = "{FSST}: fast random access string compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2649--2661", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407851", abstract = "Strings are prevalent in real-world data sets. They often occupy a large fraction of the data and are slow to process. In this work, we present Fast Static Symbol Table (FSST), a lightweight compression scheme for strings. On text data, FSST offers \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vogel:2020:MBC, author = "Lukas Vogel and Viktor Leis and Alexander van Renen and Thomas Neumann and Satoshi Imamura and Alfons Kemper", title = "{Mosaic}: a budget-conscious storage engine for relational database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2662--2675", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407852", abstract = "Relational database systems are purpose-built for a specific storage device class (e.g., HDD, SSD, or DRAM). They do not cope well with the multitude of storage devices that are competitive at their price `sweet spots'. 
To make use of different storage \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Blanusa:2020:MCE, author = "Jovan Blanusa and Radu Stoica and Paolo Ienne and Kubilay Atasu", title = "Manycore clique enumeration with fast set intersections", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2676--2690", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407853", abstract = "Listing all maximal cliques of a given graph has important applications in the analysis of social and biological networks. Parallelisation of maximal clique enumeration (MCE) algorithms on modern manycore processors is challenging due to the task-level \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bater:2020:SPP, author = "Johes Bater and Yongjoo Park and Xi He and Xiao Wang and Jennie Rogers", title = "{SAQE}: practical privacy-preserving approximate query processing for data federations", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "12", pages = "2691--2705", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3407790.3407854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:33:57 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3407790.3407854", abstract = "A private data federation enables clients to query the union of data from multiple data providers without revealing any extra private information to the client or any other data providers. 
Unfortunately, this strong end-to-end privacy guarantee requires \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kuhlman:2020:RAA,
  author =       "Caitlin Kuhlman and Elke Rundensteiner",
  title =        "Rank aggregation algorithms for fair consensus",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2706--2719",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407855",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407855",
  abstract =     "Aggregating multiple rankings in a database is an
                 important task well studied by the database community.
                 High-stakes application domains include hiring,
                 lending, and education where multiple decision makers
                 rank candidates and their input is then \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Glasbergen:2020:SUA,
  author =       "Brad Glasbergen and Michael Abebe and Khuzaima Daudjee
                 and Amit Levi",
  title =        "{Sentinel}: universal analysis and insight for data
                 systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2720--2733",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407856",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407856",
  abstract =     "Systems continue to grow in complexity in response to
                 the need to support vast quantities of data and a wide
                 variety of workloads.
Small changes in workloads and system configuration can result in
                 significantly different system behaviour and
                 performance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Fang:2020:ODC,
  author =       "Jingzhi Fang and Yanyan Shen and Yue Wang and Lei
                 Chen",
  title =        "Optimizing {DNN} computation graph using graph
                 substitutions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2734--2746",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407857",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407857",
  abstract =     "Deep learning has achieved great success in various
                 real-world applications. As deep neural networks (DNNs)
                 are getting larger, the inference and training cost of
                 DNNs increases significantly. Since one round of
                 inference or one iteration in the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sen:2020:ANL,
  author =       "Jaydeep Sen and Chuan Lei and Abdul Quamar and Fatma
                 {\"O}zcan and Vasilis Efthymiou and Ayushi Dalmia and
                 Greg Stager and Ashish Mittal and Diptikalyan Saha and
                 Karthik Sankaranarayanan",
  title =        "{ATHENA++}: natural language querying for complex
                 nested {SQL} queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2747--2759",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407858",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407858",
  abstract =     "Natural Language Interfaces to Databases (NLIDB)
                 systems eliminate the requirement for an end user to
                 use complex query languages like SQL, by translating
                 the input natural language (NL) queries to SQL
                 automatically. Although a significant volume of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Xu:2020:CAD,
  author =       "Min Xu and Bolin Ding and Tianhao Wang and Jingren
                 Zhou",
  title =        "Collecting and analyzing data jointly from multiple
                 services under local differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2760--2772",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407859",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407859",
  abstract =     "Users' sensitive data can be collected and analyzed
                 under local differential privacy (LDP) without the need
                 to trust the data collector.
Most previous work on LDP can be applied when each user's data is
                 generated and collected from a single service or
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gan:2020:IDA,
  author =       "Yifan Gan and Xueyuan Ren and Drew Ripberger and
                 Spyros Blanas and Yang Wang",
  title =        "{IsoDiff}: debugging anomalies caused by weak
                 isolation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2773--2786",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407860",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407860",
  abstract =     "Weak isolation levels, such as Read Committed and
                 Snapshot Isolation, are widely used by databases for
                 their higher concurrency, but may introduce subtle
                 correctness errors in applications that only experts
                 can identify. This paper proposes IsoDiff, a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Barsky:2020:SRN,
  author =       "Marina Barsky and Jonathan Gabor and Mariano P.
                 Consens and Alex Thomo",
  title =        "Suffix rank: a new scalable algorithm for indexing
                 large string collections",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "2787--2800",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3407790.3407861",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:33:57 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3407790.3407861",
  abstract =     "We investigate the problem of building a suffix array
                 substring index for inputs significantly larger than
                 main memory.
This problem is especially important in the context of biological
                 sequence analysis, where biological polymers can be
                 thought of as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% Page gap at journal Web site in v13n12 pp 2801--3424

@Article{Haritsa:2020:RQP,
  author =       "Jayant R. Haritsa",
  title =        "Robust query processing: mission possible",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3425--3428",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415561",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Robust query processing with strong performance
                 guarantees is an extremely desirable objective in the
                 design of industrial-strength database engines.
                 However, it has proved to be a largely intractable and
                 elusive challenge in spite of sustained efforts
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Whang:2020:DCQ,
  author =       "Steven Euijong Whang and Jae-Gil Lee",
  title =        "Data collection and quality challenges for deep
                 learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3429--3432",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415562",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Software 2.0 refers to the fundamental shift in
                 software engineering where using machine learning
                 becomes the new norm in software with the availability
                 of big data and computing infrastructure.
As a result, many software engineering practices need to be \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Burdick:2020:TEU,
  author =       "Douglas Burdick and Marina Danilevsky and Alexandre V.
                 Evfimievski and Yannis Katsis and Nancy Wang",
  title =        "Table extraction and understanding for scientific and
                 enterprise applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3433--3436",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415563",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Valuable high-precision data are often published in
                 the form of tables in both scientific and business
                 documents. While humans can easily identify, interpret
                 and contextualize tables, developing general-purpose
                 automated techniques for extraction of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Qin:2020:SQP,
  author =       "Jianbin Qin and Wei Wang and Chuan Xiao and Ying
                 Zhang",
  title =        "Similarity query processing for high-dimensional
                 data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3437--3440",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415564",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Similarity query processing has been an active
                 research topic for several decades. It is an essential
                 procedure in a wide range of applications. Recently,
                 embedding and auto-encoding methods as well as
                 pre-trained models have gained popularity.
They \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gupta:2020:BHT,
  author =       "Suyash Gupta and Jelle Hellings and Sajjad Rahnama and
                 Mohammad Sadoghi",
  title =        "Building high throughput permissioned blockchain
                 fabrics: challenges and opportunities",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3441--3444",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415565",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Since the introduction of Bitcoin---the first
                 widespread application driven by blockchains---the
                 interest in the design of blockchain-based applications
                 has increased tremendously. At the core of these
                 applications are consensus protocols that securely
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Asudeh:2020:FES,
  author =       "Abolfazl Asudeh and H. V. Jagadish",
  title =        "Fairly evaluating and scoring items in a data set",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3445--3448",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415566",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We frequently compute a score for each item in a data
                 set, sometimes for its intrinsic value, but more often
                 as a step towards classification, ranking, and so
                 forth. The importance of computing this score fairly
                 cannot be overstated. In this tutorial, we \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2020:SVD,
  author =       "Meihui Zhang and Zhongle Xie and Cong Yue and Ziyue
                 Zhong",
  title =        "{Spitz}: a verifiable database system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3449--3460",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415567",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Databases in the past have helped businesses maintain
                 and extract insights from their data. Today, it is
                 common for a business to involve multiple independent,
                 distrustful parties. This trend towards
                 decentralization introduces a new and important
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Melnik:2020:DDI,
  author =       "Sergey Melnik and Andrey Gubarev and Jing Jing Long
                 and Geoffrey Romer and Shiva Shivakumar and Matt Tolton
                 and Theo Vassilakis and Hossein Ahmadi and Dan Delorey
                 and Slava Min and Mosha Pasumansky and Jeff Shute",
  title =        "{Dremel}: a decade of interactive {SQL} analysis at
                 web scale",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3461--3472",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415568",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Google's Dremel was one of the first systems that
                 combined a set of architectural principles that have
                 become a common practice in today's cloud-native
                 analytics tools, including disaggregated storage and
                 compute, in situ analysis, and columnar storage
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ailamaki:2020:JWD,
  author =       "Anastasia Ailamaki",
  title =        "{JIT} works: decide when all data is known ({VLDB}
                 women in database research award talk)",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3473--3473",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415569",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "When the time comes to make a critical decision, it is
                 of paramount importance to prepare enough so that all
                 the information necessary is available at decision
                 time. Under-preparation leads to uninformed decisions;
                 over-preparation, however, may lead to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Stoyanovich:2020:RDM,
  author =       "Julia Stoyanovich and Bill Howe and H. V. Jagadish",
  title =        "Responsible data management",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3474--3488",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415570",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The need for responsible data management intensifies
                 with the growing impact of data on society. One central
                 locus of the societal impact of data are Automated
                 Decision Systems (ADS), socio-legal-technical systems
                 that are used broadly in industry, non-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Goda:2020:OED,
  author =       "Kazuo Goda and Yuto Hayamizu and Hiroyuki Yamada and
                 Masaru Kitsuregawa",
  title =        "Out-of-order execution of database queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3489--3501",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415571",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Intra-query parallelism is a key for database software
                 to offer acceptable responsiveness for data-intensive
                 queries. Many researchers have studied how to achieve
                 greater execution parallelism for database queries.
                 Partitioning is a representative \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Olteanu:2020:RDB,
  author =       "Dan Olteanu",
  title =        "The relational data borg is learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3502--3515",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415572",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper overviews an approach that addresses
                 machine learning over relational data as a database
                 problem. This is justified by two observations. First,
                 the input to the learning task is commonly the result
                 of a feature extraction query over the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bailis:2020:WSD,
  author =       "Peter Bailis and Juliana Freire and Magda Balazinska
                 and Raghu Ramakrishnan and Joseph M. Hellerstein and
                 Xin Luna Dong and Michael Stonebraker",
  title =        "Winds from {Seattle}: database research directions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "12",
  pages =        "3516--3516",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3415478.3415573",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:06:01 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The database research community has been notably
                 successful in impacting the industry and academia since
                 the invention of the relational model. Examples of
                 innovation in the last decade include columnar storage
                 for data analytic platforms, cloud data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% Entries for v13n13 (sep 2020) begin here

@Article{Zeng:2020:SBI,
  author =       "Yuxiang Zeng and Yongxin Tong and Yuguang Song and Lei
                 Chen",
  title =        "The simpler the better: an indexing approach for
                 shared-route planning queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3517--3530",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424574",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424574",
  abstract =     "Ridesharing services have gained global popularity as
                 a convenient, economic, and sustainable transportation
                 mode in recent years.
One fundamental challenge in these services is planning the
                 shared-routes (i.e., sequences of origins and
                 destinations) \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Tanabe:2020:ACC,
  author =       "Takayuki Tanabe and Takashi Hoshino and Hideyuki
                 Kawashima and Osamu Tatebe",
  title =        "An analysis of concurrency control protocols for
                 in-memory databases with {CCBench}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3531--3544",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424575",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424575",
  abstract =     "This paper presents yet another concurrency control
                 analysis platform, CCBench. CCBench supports seven
                 protocols (Silo, TicToc, MOCC, Cicada, SI, SI with
                 latch-free SSN, 2PL) and seven versatile optimization
                 methods and enables the configuration of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2020:IUS,
  author =       "Tianhao Wang and Bolin Ding and Min Xu and Zhicong
                 Huang and Cheng Hong and Jingren Zhou and Ninghui Li
                 and Somesh Jha",
  title =        "Improving utility and security of the shuffler-based
                 differential privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3545--3558",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424576",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424576",
  abstract =     "When collecting information, local differential
                 privacy (LDP) alleviates privacy concerns of users
                 because their private information is randomized before
                 being sent it to the central aggregator. LDP imposes
                 large amount of noise as each user executes \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kipf:2020:CIL,
  author =       "Andreas Kipf and Damian Chromejko and Alexander Hall
                 and Peter Boncz and David G. Andersen",
  title =        "Cuckoo index: a lightweight secondary index
                 structure",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3559--3572",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424577",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424577",
  abstract =     "In modern data warehousing, data skipping is essential
                 for high query performance.
While index structures such as B-trees or hash tables allow for precise
                 pruning, their large storage requirements make them
                 impractical for indexing secondary columns. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Abebe:2020:MAP,
  author =       "Michael Abebe and Brad Glasbergen and Khuzaima
                 Daudjee",
  title =        "{MorphoSys}: automatic physical design metamorphosis
                 for distributed database systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3573--3587",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424578",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424578",
  abstract =     "Distributed database systems are widely used to meet
                 the demands of storing and managing computation-heavy
                 workloads. To boost performance and minimize resource
                 and data contention, these systems require selecting a
                 distributed physical design that \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Behnezhad:2020:PGA,
  author =       "Soheil Behnezhad and Laxman Dhulipala and Hossein
                 Esfandiari and Jakub Lacki and Vahab Mirrokni and
                 Warren Schudy",
  title =        "Parallel graph algorithms in constant adaptive rounds:
                 theory meets practice",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3588--3602",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424579",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424579",
  abstract =     "We study fundamental graph problems such as graph
                 connectivity, minimum spanning forest (MSF), and
                 approximate maximum (weight) matching in a distributed
                 setting. In particular, we focus on the Adaptive
                 Massively Parallel Computation (AMPC) model, which
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2020:DLP,
  author =       "Runhui Wang and Dong Deng",
  title =        "{DeltaPQ}: lossless product quantization code
                 compression for high dimensional similarity search",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "13",
  number =       "13",
  pages =        "3603--3616",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3424573.3424580",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3424573.3424580",
  abstract =     "High dimensional data is ubiquitous and plays an
                 important role in many applications. However, the size
                 of high dimensional data is usually excessively large.
To alleviate this problem, in this paper, we develop novel techniques to
                 compress and search \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% Entries for v14n1 (sep 2020) begin here

@Article{Marcus:2020:BLI,
  author =       "Ryan Marcus and Andreas Kipf and Alexander van Renen
                 and Mihail Stoian and Sanchit Misra and Alfons Kemper
                 and Thomas Neumann and Tim Kraska",
  title =        "Benchmarking learned indexes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "1--13",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421425",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421425",
  abstract =     "Recent advancements in learned index structures
                 propose replacing existing index structures, like
                 B-Trees, with approximate learned models. In this work,
                 we present a unified benchmark that compares well-tuned
                 implementations of three learned index \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2020:TGC,
  author =       "Zuozhi Wang and Kai Zeng and Botong Huang and Wei Chen
                 and Xiaozong Cui and Bo Wang and Ji Liu and Liya Fan
                 and Dachuan Qu and Zhenyu Hou and Tao Guan and Chen Li
                 and Jingren Zhou",
  title =        "{Tempura}: a general cost-based optimizer framework
                 for incremental data processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "14--27",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421427",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421427",
  abstract =     "Incremental processing is widely-adopted in many
                 applications, ranging from incremental view
                 maintenance, stream computing, to recently emerging
                 progressive data warehouse and intermittent query
                 processing. Despite many algorithms developed on this
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Heo:2020:IGD,
  author =       "Geon Heo and Yuji Roh and Seonghyeon Hwang and Dayun
                 Lee and Steven Euijong Whang",
  title =        "{Inspector Gadget}: a data programming-based labeling
                 system for industrial images",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "28--36",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421429",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421429",
  abstract =     "As machine learning for images becomes democratized in
                 the Software 2.0 era, one of the serious bottlenecks is
                 securing enough labeled data for training.
This problem is especially critical in a manufacturing setting where
                 smart factories rely on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2020:SAN,
  author =       "Renchi Yang and Jieming Shi and Xiaokui Xiao and Yin
                 Yang and Juncheng Liu and Sourav S. Bhowmick",
  title =        "Scaling attributed network embedding to massive
                 graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "37--49",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421430",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421430",
  abstract =     "Given a graph $G$ where each node is associated with a
                 set of attributes, attributed network embedding (ANE)
                 maps each node $ v \in G $ to a compact vector
                 $ X_v $, which can be used in downstream machine
                 learning tasks. Ideally, $ X_v $ should capture node
                 $v$'s affinity. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2020:DEM,
  author =       "Yuliang Li and Jinfeng Li and Yoshihiko Suhara and
                 AnHai Doan and Wang-Chiew Tan",
  title =        "Deep entity matching with pre-trained language
                 models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "50--60",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421431",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421431",
  abstract =     "We present Ditto, a novel entity matching system based
                 on pre-trained Transformer-based language models.
We fine-tune and cast EM as a sequence-pair classification problem to
                 leverage such models with a simple architecture. Our
                 experiments show that a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2020:NOC,
  author =       "Zongheng Yang and Amog Kamsetty and Sifei Luan and
                 Eric Liang and Yan Duan and Xi Chen and Ion Stoica",
  title =        "{NeuroCard}: one cardinality estimator for all
                 tables",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "1",
  pages =        "61--73",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3421424.3421432",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 15 05:34:02 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3421424.3421432",
  abstract =     "Query optimizers rely on accurate cardinality
                 estimates to produce good execution plans. Despite
                 decades of research, existing cardinality estimators
                 are inaccurate for complex queries, due to making lossy
                 modeling assumptions and not capturing
                 inter-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2020:TLM, author = "Jialin Ding and Vikram Nathan and Mohammad Alizadeh and Tim Kraska", title = "{Tsunami}: a learned multi-dimensional index for correlated data and skewed workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "74--86", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425880", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425880", abstract = "Filtering data based on predicates is one of the most fundamental operations for any modern data warehouse. Techniques to accelerate the execution of filter expressions include clustered indexes, specialized sort orders (e.g., Z-order), multi-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2020:JOP, author = "Daniel Kang and Ankit Mathur and Teja Veeramacheneni and Peter Bailis and Matei Zaharia", title = "Jointly optimizing preprocessing and inference for {DNN}-based visual analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "87--100", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425881", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425881", abstract = "While deep neural networks (DNNs) are an increasingly popular way to query large corpora of data, their significant runtime remains an active area of research.
As a result, researchers have proposed systems and optimizations to reduce these costs by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Menon:2020:PCQ, author = "Prashanth Menon and Amadou Ngom and Lin Ma and Todd C. Mowry and Andrew Pavlo", title = "Permutable compiled queries: dynamically adapting compiled queries without recompiling", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "101--113", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425882", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425882", abstract = "Just-in-time (JIT) query compilation is a technique to improve analytical query performance in database management systems (DBMSs). But the cost of compiling each query can be significant relative to its execution time. This overhead prohibits the DBMS \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Min:2020:EEM, author = "Seung Won Min and Vikram Sharma Mailthody and Zaid Qureshi and Jinjun Xiong and Eiman Ebrahimi and Wen-mei Hwu", title = "{EMOGI}: efficient memory-access for out-of-memory graph-traversal in {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "114--127", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425883", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425883", abstract = "Modern analytics and recommendation systems are increasingly based on graph data that capture the relations between entities being analyzed. Practical graphs come in huge sizes, offer massive parallelism, and are stored in sparse-matrix formats such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2020:SFA, author = "Yinda Zhang and Jinyang Li and Yutian Lei and Tong Yang and Zhetao Li and Gong Zhang and Bin Cui", title = "On-off sketch: a fast and accurate sketch on persistence", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "128--140", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425884", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425884", abstract = "Approximate stream processing has attracted much attention recently. Prior art mostly focuses on characteristics like frequency, cardinality, and quantile. Persistence, as a new characteristic, is getting increasing attention. 
Unlike frequency, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tran:2020:RTD, author = "Luan Tran and Min Y. Mun and Cyrus Shahabi", title = "Real-time distance-based outlier detection in data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "141--153", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425885", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425885", abstract = "Real-time outlier detection in data streams has drawn much attention recently as many applications need to be able to detect abnormal behaviors as soon as they occur. The arrival and departure of streaming data on edge devices impose new challenges to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Poppe:2020:SIL, author = "Olga Poppe and Tayo Amuneke and Dalitso Banda and Aritra De and Ari Green and Manon Knoertzer and Ehi Nosakhare and Karthik Rajendran and Deepak Shankargouda and Meina Wang and Alan Au and Carlo Curino and Qun Guo and Alekh Jindal and Ajay Kalhan and Morgan Oslake and Sonia Parchani and Vijay Ramani and Raj Sellappan and Saikat Sen and Sheetal Shrotri and Soundararajan Srinivasan and Ping Xia and Shize Xu and Alicia Yang and Yiwen Zhu", title = "{Seagull}: an infrastructure for load prediction and optimized resource allocation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "154--162", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425886", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425886", abstract = "Microsoft Azure is dedicated to guarantee high quality of service to its customers, in particular, during periods of high customer activity, while controlling cost. We employ a Data Science (DS) driven solution to predict user load and leverage these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:EKM, author = "Sheng Wang and Yuan Sun and Zhifeng Bao", title = "On the efficiency of {K-means} clustering: evaluation, optimization, and algorithm selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "163--175", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425887", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425887", abstract = "This paper presents a thorough evaluation of the existing methods that accelerate Lloyd's algorithm for fast $k$-means clustering. To do so, we analyze the pruning mechanisms of existing methods, and summarize their common pipeline into a unified \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2020:RHA, author = "Shixuan Sun and Xibo Sun and Yulin Che and Qiong Luo and Bingsheng He", title = "{RapidMatch}: a holistic approach to subgraph query processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "176--188", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425888", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425888", abstract = "A subgraph query searches for all embeddings in a data graph that are identical to a query graph. Two kinds of algorithms, either graph exploration based or join based, have been developed for processing subgraph queries. Due to algorithmic and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xia:2020:TLP, author = "Yu Xia and Xiangyao Yu and Andrew Pavlo and Srinivas Devadas", title = "{Taurus}: lightweight parallel logging for in-memory database management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "189--201", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425889", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425889", abstract = "Existing single-stream logging schemes are unsuitable for in-memory database management systems (DBMSs) as the single log is often a performance bottleneck. To overcome this problem, we present Taurus, an efficient parallel logging scheme that uses \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paul:2020:IEE, author = "Johns Paul and Bingsheng He and Shengliang Lu and Chiew Tong Lau", title = "Improving execution efficiency of just-in-time compilation based query processing on {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "202--214", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425890", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425890", abstract = "In recent years, we have witnessed significant efforts to improve the performance of Online Analytical Processing (OLAP) on graphics processing units (GPUs). 
Most existing studies have focused on improving memory efficiency since memory stalls can play \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:PTS, author = "Shuang Wang and Hakan Ferhatosmanoglu", title = "{PPQ}-trajectory: spatio-temporal quantization for querying in large trajectory repositories", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "215--227", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425891", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425891", abstract = "We present PPQ-trajectory, a spatio-temporal quantization based solution for querying large dynamic trajectory data. PPQ-trajectory includes a partition-wise predictive quantizer (PPQ) that generates an error-bounded codebook with autocorrelation and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2020:ADP, author = "Xiao Hu and Shouzhuo Sun and Shweta Patwa and Debmalya Panigrahi and Sudeepa Roy", title = "Aggregated deletion propagation for counting conjunctive query answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "2", pages = "228--240", month = oct, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3425879.3425892", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3425879.3425892", abstract = "We investigate the computational complexity of minimizing the source side-effect in order to remove a given number of tuples from the output of a conjunctive query. This is a variant of the well-studied deletion propagation problem, the difference being \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2020:BMW, author = "Chen Luo and Michael J. Carey", title = "Breaking down memory walls: adaptive memory management in {LSM}-based storage systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "241--254", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442425", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442425", abstract = "Log-Structured Merge-trees (LSM-trees) have been widely used in modern NoSQL systems. Due to their out-of-place update design, LSM-trees have introduced memory walls among the memory components of multiple LSM-trees and between the write memory and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karlas:2020:NNC, author = "Bojan Karlas and Peng Li and Renzhi Wu and Nezihe Merve G{\"u}rel and Xu Chu and Wentao Wu and Ce Zhang", title = "Nearest neighbor classifiers over incomplete information: from certain answers to certain predictions", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "255--267", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442426", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442426", abstract = "Machine learning (ML) applications have been thriving recently, largely attributed to the increasing availability of data. However, inconsistency and incomplete information are ubiquitous in real-world datasets, and their impact on ML applications \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kingsbury:2020:EII, author = "Kyle Kingsbury and Peter Alvaro", title = "{Elle}: inferring isolation anomalies from experimental observations", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "268--280", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442427", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442427", abstract = "Users who care about their data store it in databases, which (at least in principle) guarantee some form of transactional isolation. However, experience shows that many databases do not provide the isolation guarantees they claim. 
With the recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kiefer:2020:SGF, author = "Martin Kiefer and Ilias Poulakis and Sebastian Bre{\ss} and Volker Markl", title = "{Scotch}: generating {FPGA}-accelerators for sketching at line rate", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "281--293", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442428", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442428", abstract = "Sketching algorithms are a powerful tool for single-pass data summarization. Their numerous applications include approximate query processing, machine learning, and large-scale network monitoring. In the presence of high-bandwidth interconnects or in-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khayati:2020:OOR, author = "Mourad Khayati and Ines Arous and Zakhar Tymchenko and Philippe Cudr{\'e}-Mauroux", title = "{ORBITS}: online recovery of missing values in multiple time series streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "294--306", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442429", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442429", abstract = "With the emergence of the Internet of Things (IoT), time series streams have become ubiquitous in our daily life.
Recording such data is rarely a perfect process, as sensor failures frequently occur, yielding occasional blocks of data that go missing in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2020:TTU, author = "Xiang Deng and Huan Sun and Alyssa Lees and You Wu and Cong Yu", title = "{TURL}: table understanding through representation learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "307--319", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442430", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442430", abstract = "Relational tables on the Web store a vast amount of knowledge. Owing to the wealth of such tables, there has been tremendous progress on a variety of tasks in the area of table understanding. However, existing work generally relies on heavily-engineered \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2020:EUD, author = "Long Guo and Lifeng Hua and Rongfei Jia and Fei Fang and Binqiang Zhao and Bin Cui", title = "{EdgeDIPN}: a unified deep intent prediction network deployed at the edge", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "320--328", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442431", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442431", abstract = "With the rapid growth of e-commerce in recent years, e-commerce platforms are becoming a primary place for people to find, compare and ultimately purchase products. To improve online shopping experience for consumers and increase sales for sellers, it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2020:LCW, author = "Yiming Lin and Daokun Jiang and Roberto Yus and Georgios Bouloukakis and Andrew Chio and Sharad Mehrotra and Nalini Venkatasubramanian", title = "{Locater}: cleaning {WiFi} connectivity datasets for semantic localization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "329--341", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442432", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442432", abstract = "This paper explores the data cleaning challenges that arise in using WiFi connectivity data to locate users to semantic indoor locations such as buildings, regions, rooms.
WiFi connectivity data consists of sporadic connections between devices and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2020:MMT, author = "Hao Liu and Jindong Han and Yanjie Fu and Jingbo Zhou and Xinjiang Lu and Hui Xiong", title = "Multi-modal transportation recommendation with unified route representation learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "342--350", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442433", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442433", abstract = "Multi-modal transportation recommendation aims to provide the most appropriate travel route with various transportation modes according to certain criteria. After analyzing large-scale navigation data, we find that route representations exhibit two \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2020:DDF, author = "Yue Wang and Ruiqi Xu and Zonghao Feng and Yulin Che and Lei Chen and Qiong Luo and Rui Mao", title = "{Disk}: a distributed framework for single-source {SimRank} with accuracy guarantee", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "351--363", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442434", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442434", abstract = "Measuring similarities among different nodes is important in graph analysis. 
SimRank is one of the most popular similarity measures. Given a graph $ G(V, E) $ and a source node $u$, a single-source SimRank query returns the similarities between $u$ and each \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Didona:2020:TBU, author = "Diego Didona and Nikolas Ioannou and Radu Stoica and Kornilios Kourtis", title = "Toward a better understanding and evaluation of tree structures on flash {SSDs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "364--377", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442435", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442435", abstract = "Solid-state drives (SSDs) are extensively used to deploy persistent data stores, as they provide low latency random access, high write throughput, high data density, and low cost. Tree-based data structures are widely used to build persistent data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2020:AMD, author = "Jianyu Yang and Tianhao Wang and Ninghui Li and Xiang Cheng and Sen Su", title = "Answering multi-dimensional range queries under local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "378--390", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442436", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442436", abstract = "In this paper, we tackle the problem of answering multi-dimensional range queries under local differential privacy. There are three key technical challenges: capturing the correlations among attributes, avoiding the curse of dimensionality, and dealing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Palyvos-Giannas:2020:ASF, author = "Dimitris Palyvos-Giannas and Bastian Havers and Marina Papatriantafilou and Vincenzo Gulisano", title = "{Ananke}: a streaming framework for live forward provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "391--403", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442437", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442437", abstract = "Data streaming enables online monitoring of large and continuous event streams in Cyber-Physical Systems (CPSs). 
In such scenarios, fine-grained backward provenance tools can connect streaming query results to the source data producing them, allowing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lakhotia:2020:RRC, author = "Kartik Lakhotia and Rajgopal Kannan and Viktor Prasanna and Cesar A. F. {De Rose}", title = "{Receipt}: refine coarse-grained independent tasks for parallel tip decomposition of bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "404--417", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442438", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442438", abstract = "Tip decomposition is a crucial kernel for mining dense subgraphs in bipartite networks, with applications in spam detection, analysis of affiliation networks etc. It creates a hierarchy of vertex-induced subgraphs with varying densities determined by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deep:2020:CEW, author = "Shaleen Deep and Anja Gruenheid and Paraschos Koutris and Jeffrey Naughton and Stratis Viglas", title = "Comprehensive and efficient workload compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "418--430", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442439", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442439", abstract = "This work studies the problem of constructing a representative workload from a given input analytical query workload where the former serves as an approximation with guarantees of the latter. We discuss our work in the context of workload analysis and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2020:CCO, author = "Yongjun He and Jiacheng Lu and Tianzheng Wang", title = "{CoroBase}: coroutine-oriented main-memory database engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "431--444", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442440", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442440", abstract = "Data stalls are a major overhead in main-memory database engines due to the use of pointer-rich data structures. Lightweight coroutines ease the implementation of software prefetching to hide data stalls by overlapping computation and asynchronous data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Smith:2020:SQN, author = "Jaclyn Smith and Michael Benedikt and Milos Nikolic and Amir Shaikhha", title = "Scalable querying of nested data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "3", pages = "445--457", month = nov, year = "2020", CODEN = "????", DOI = "https://doi.org/10.5555/3430915.3442441", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 15 05:34:04 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.5555/3430915.3442441", abstract = "While large-scale distributed data processing platforms have become an attractive target for query processing, these systems are problematic for applications that deal with nested collections. Programmers are forced either to perform non-trivial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2020:SCE, author = "Long Gong and Ziheng Liu and Liang Liu and Jun Xu and Mitsunori Ogihara and Tong Yang", title = "Space- and computationally-efficient set reconciliation via parity bitmap sketch {(PBS)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "458--470", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436906", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436906", abstract = "Set reconciliation is a fundamental algorithmic problem that arises in many networking, system, and database applications. In this problem, two large sets A and B of objects (bitcoins, files, records, etc.) are stored respectively at two different \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shetiya:2020:AAS, author = "Suraj Shetiya and Saravanan Thirumuruganathan and Nick Koudas and Gautam Das", title = "{Astrid}: accurate selectivity estimation for string predicates using deep learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "471--484", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436907", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436907", abstract = "Accurate selectivity estimation for string predicates is a long-standing research challenge in databases. Supporting pattern matching on strings (such as prefix, substring, and suffix) makes this problem much more challenging, thereby necessitating a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2020:CTR, author = "Nan Zheng and Zachary G. Ives", title = "Compact, tamper-resistant archival of fine-grained provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "485--497", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436909", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436909", abstract = "Data provenance tools aim to facilitate reproducible data science and auditable data analyses, by tracking the processes and inputs responsible for each result of an analysis. 
Fine-grained provenance further enables sophisticated reasoning about why \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Muller:2020:RDI, author = "Ingo M{\"u}ller and Ghislain Fourny and Stefan Irimescu and Can Berker Cikis and Gustavo Alonso", title = "{Rumble}: data independence for large messy data sets", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "498--506", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436910", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436910", abstract = "This paper introduces Rumble, a query execution engine for large, heterogeneous, and nested collections of JSON objects built on top of Apache Spark. While data sets of this type are more and more wide-spread, most existing tools are built around a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chapman:2020:CQF, author = "Adriane Chapman and Paolo Missier and Giulia Simonelli and Riccardo Torlone", title = "Capturing and querying fine-grained provenance of preprocessing pipelines in data science", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "507--520", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436911", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436911", abstract = "Data processing pipelines that are designed to clean, transform and alter data in preparation for learning predictive models, have an impact on those models' accuracy and performance, as well on other properties, such as model fairness. It is therefore \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Farias:2020:LDD, author = "Victor A. E. Farias and Felipe T. Brito and Cheryl Flynn and Javam C. Machado and Subhabrata Majumdar and Divesh Srivastava", title = "Local dampening: differential privacy for non-numeric queries via local sensitivity", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "521--533", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436912", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436912", abstract = "Differential privacy is the state-of-the-art formal definition for data release under strong privacy guarantees. 
A variety of mechanisms have been proposed in the literature for releasing the noisy output of numeric queries (e.g., using the Laplace \ldots{}).", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2020:MDS, author = "Tianyu Li and Matthew Butrovich and Amadou Ngom and Wan Shen Lim and Wes McKinney and Andrew Pavlo", title = "Mainlining databases: supporting fast transactional workloads on universal columnar data file formats", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "534--546", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436913", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436913", abstract = "The proliferation of modern data processing tools has given rise to open-source columnar data formats. These formats help organizations avoid repeated conversion of data to a new format for each application. However, these formats are read-only, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2020:AEC, author = "Shengliang Lu and Bingsheng He and Yuchen Li and Hao Fu", title = "Accelerating exact constrained shortest paths on {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "547--559", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436914", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436914", abstract = "The recently emerging applications such as software-defined networks and autonomous vehicles require efficient and exact solutions for constrained shortest paths (CSP), which finds the shortest path in a graph while satisfying some user-defined \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mo:2020:TEW, author = "Songsong Mo and Zhifeng Bao and Ping Zhang and Zhiyong Peng", title = "Towards an efficient weighted random walk domination", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "560--572", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436915", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436915", abstract = "In this paper, we propose and study a new problem called the weighted random walk domination. Given a weighted graph $ G (V, E) $ and a budget $B$ of the weighted random walk, it aims to find a $k$-size set $S$, which can minimize the total costs of the remaining \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2020:SMM, author = "Guimu Guo and Da Yan and M. Tamer {\"O}zsu and Zhe Jiang and Jalal Khalil", title = "Scalable mining of maximal quasi-cliques: an algorithm-system codesign approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "573--585", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436916", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436916", abstract = "Given a user-specified minimum degree threshold $ \gamma $, a $ \gamma $-quasiclique is a subgraph $ g = (V_g, E_g)$ where each vertex $ \nu \in V_g$ connects to at least $ \gamma $ fraction of the other vertices (i.e., $ \lceil \gamma \cdot (|V_g| - 1) \rceil $ vertices) in $g$. Quasi-clique is one of the most natural \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kokoris-Kogias:2020:CPD, author = "Eleftherios Kokoris-Kogias and Enis Ceyhun Alp and Linus Gasser and Philipp Jovanovic and Ewa Syta and Bryan Ford", title = "{CALYPSO}: private data management for decentralized ledgers", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "586--599", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436917", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436917", abstract = "Distributed ledgers provide high availability and integrity, making them a key enabler for practical and secure computation of distributed workloads among mutually distrustful parties. Many practical applications also require strong confidentiality, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deeds:2020:SFL, author = "Kyle Deeds and Brian Hentschel and Stratos Idreos", title = "Stacked filters: learning to filter by structure", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "600--612", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436919", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436919", abstract = "We present Stacked Filters, a new probabilistic filter which is fast and robust similar to query-agnostic filters (such as Bloom and Cuckoo filters), and at the same time brings low false positive rates and sizes similar to classifier-based filters \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Banerjee:2020:MSW, author = "Prithu Banerjee and Wei Chen and Laks V. S. Lakshmanan", title = "Maximizing social welfare in a competitive diffusion model", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "613--625", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436920", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436920", abstract = "Influence maximization (IM) has garnered a lot of attention in the literature owing to applications such as viral marketing and infection containment. It aims to select a small number of seed users to adopt an item such that adoption propagates to a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gugnani:2020:UIR, author = "Shashank Gugnani and Arjun Kashyap and Xiaoyi Lu", title = "Understanding the idiosyncrasies of real persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "626--639", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436921", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436921", abstract = "High capacity persistent memory (PMEM) is finally commercially available in the form of Intel's Optane DC Persistent Memory Module (DCPMM). Researchers have raced to evaluate and understand the performance of DCPMM itself as well as systems and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gale:2020:EMR, author = "Abraham Gale and Am{\'e}lie Marian", title = "Explaining monotonic ranking functions", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "640--652", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436922", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436922", abstract = "Ranking functions are commonly used to assist in decision-making in a wide variety of applications. As the general public realizes the significant societal impacts of the widespread use of algorithms in decision-making, there has been a push towards \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dhulipala:2020:CFS, author = "Laxman Dhulipala and Changwan Hong and Julian Shun", title = "{ConnectIt}: a framework for static and incremental parallel graph connectivity algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "653--667", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436923", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436923", abstract = "Connected components is a fundamental kernel in graph applications. The fastest existing multicore algorithms for solving graph connectivity are based on some form of edge sampling and/or linking and compressing trees. However, many combinations of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kouadri:2020:QSA, author = "Wissam Mammar Kouadri and Mourad Ouziri and Salima Benbernou and Karima Echihabi and Themis Palpanas and Iheb {Ben Amor}", title = "Quality of sentiment analysis tools: the reasons of inconsistency", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "668--681", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436924", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436924", abstract = "In this paper, we present a comprehensive study that evaluates six state-of-the-art sentiment analysis tools on five public datasets, based on the quality of predictive results in the presence of semantically equivalent documents, i.e., how consistent \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Garcia:2020:HLM, author = "Rolando Garcia and Eric Liu and Vikram Sreekanti and Bobby Yan and Anusha Dandamudi and Joseph E. Gonzalez and Joseph M. Hellerstein and Koushik Sen", title = "Hindsight logging for model training", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "682--693", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436925", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436925", abstract = "In modern Machine Learning, model training is an iterative, experimental process that can consume enormous computation resources and developer time. 
To aid in that process, experienced model developers log and visualize program variables during training \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2020:SSI, author = "Lin Jiang and Junqiao Qiu and Zhijia Zhao", title = "Scalable structural index construction for {JSON} analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "694--707", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436926", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436926", abstract = "JavaScript Object Notation (JSON) and its variants have gained great popularity in recent years. Unfortunately, the performance of their analytics is often dragged down by the expensive JSON parsing. To address this, recent work has shown that building \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rui:2020:EJA, author = "Ran Rui and Hao Li and Yi-Cheng Tu", title = "Efficient join algorithms for large database tables in a multi-{GPU} environment", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "4", pages = "708--720", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.14778/3436905.3436927", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Feb 23 08:32:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3436905.3436927", abstract = "Relational join processing is one of the core functionalities in database management systems. 
It has been demonstrated that GPUs as a general-purpose parallel computing platform is very promising in processing relational joins. However, join algorithms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2021:FAP, author = "Shuyuan Yan and Bolin Ding and Wei Guo and Jingren Zhou and Zhewei Wei and Xiaowei Jiang and Sheng Xu", title = "{FlashP}: an analytical pipeline for real-time forecasting of time-series relational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "721--729", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446096", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446096", abstract = "Interactive response time is important in analytical pipelines for users to explore a sufficient number of possibilities and make informed business decisions. We consider a forecasting pipeline with large volumes of high-dimensional time series data. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Duong:2021:ESS, author = "Chi Thang Duong and Trung Dung Hoang and Hongzhi Yin and Matthias Weidlich and Quoc Viet Hung Nguyen and Karl Aberer", title = "Efficient streaming subgraph isomorphism with graph neural networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "730--742", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446097", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446097", abstract = "Queries to detect isomorphic subgraphs are important in graph-based data management. While the problem of subgraph isomorphism search has received considerable attention for the static setting of a single query, or a batch thereof, existing approaches \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2021:EBC, author = "Yi Lu and Xiangyao Yu and Lei Cao and Samuel Madden", title = "Epoch-based commit and replication in distributed {OLTP} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "743--756", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446098", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446098", abstract = "Many modern data-oriented applications are built on top of distributed OLTP databases for both scalability and high availability. 
Such distributed databases enforce atomicity, durability, and consistency through two-phase commit (2PC) and synchronous \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2021:HCM, author = "Zhe Lin and Fan Zhang and Xuemin Lin and Wenjie Zhang and Zhihong Tian", title = "Hierarchical core maintenance on large dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "757--770", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446099", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446099", abstract = "The model of $k$-core and its decomposition have been applied in various areas, such as social networks, the world wide web, and biology. A graph can be decomposed into an elegant $k$-core hierarchy to facilitate cohesive subgraph discovery and network \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohan:2021:AMD, author = "Jayashree Mohan and Amar Phanishayee and Ashish Raniwala and Vijay Chidambaram", title = "Analyzing and mitigating data stalls in {DNN} training", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "771--784", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446100", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446100", abstract = "Training Deep Neural Networks (DNNs) is resource-intensive and time-consuming.
While prior research has explored many different ways of reducing DNN training time, the impact of input data pipeline, i.e., fetching raw data items from storage and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2021:PMH, author = "Daokun Hu and Zhiwen Chen and Jianbing Wu and Jianhua Sun and Hao Chen", title = "Persistent memory hash indexes: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "785--798", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446101", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446101", abstract = "Persistent memory (PM) is increasingly being leveraged to build hash-based indexing structures featuring cheap persistence, high performance, and instant recovery, especially with the recent release of Intel Optane DC Persistent Memory Modules. However, most of them are evaluated on DRAM-based emulators with unreal assumptions, or focus on the evaluation of specific metrics with important properties sidestepped. Thus, it is essential to understand how well the proposed hash indexes perform on real PM and how they differentiate from each other if a wider range of performance metrics are considered. To this end, this paper provides a comprehensive evaluation of persistent hash tables. In particular, we focus on the evaluation of six state-of-the-art hash tables including Level hashing, CCEH, Dash, PCLHT, Clevel, and SOFT, with real PM hardware. Our evaluation was conducted using a unified benchmarking framework and representative workloads. 
Besides characterizing common performance properties, we also explore how hardware configurations (such as PM bandwidth, CPU instructions, and NUMA) affect the performance of PM-based hash tables. With our in-depth analysis, we identify design trade-offs and good paradigms in prior arts, and suggest desirable optimizations and directions for the future development of PM-based hash tables.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:OMD, author = "Cheng Chen and Jun Yang and Mian Lu and Taize Wang and Zhao Zheng and Yuqiang Chen and Wenyuan Dai and Bingsheng He and Weng-Fai Wong and Guoan Wu and Yuping Zhao and Andy Rudoff", title = "Optimizing in-memory database engine for {AI}-powered on-line decision augmentation using persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "799--812", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446102", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446102", abstract = "On-line decision augmentation (OLDA) has been considered as a promising paradigm for real-time decision making powered by Artificial Intelligence (AI). OLDA has been widely used in many applications such as real-time fraud detection, personalized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Usta:2021:DMT, author = "Arif Usta and Akifhan Karakayali and {\"O}zg{\"u}r Ulusoy", title = "{DBTagger}: multi-task learning for keyword mapping in {NLIDBs} using bi-directional recurrent neural networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "813--821", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446103", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446103", abstract = "Translating Natural Language Queries (NLQs) to Structured Query Language (SQL) in interfaces deployed in relational databases is a challenging task, which has been widely studied in database community recently. Conventional rule based systems utilize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sarkhel:2021:IIE, author = "Ritesh Sarkhel and Arnab Nandi", title = "Improving information extraction from visually rich documents using visual span representations", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "822--834", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446104", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446104", abstract = "Along with textual content, visual features play an essential role in the semantics of visually rich documents. Information extraction (IE) tasks perform poorly on these documents if these visual cues are not taken into account. 
In this paper, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:ZHT, author = "Gang Liu and Leying Chen and Shimin Chen", title = "{Zen}: a high-throughput log-free {OLTP} engine for non-volatile main memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "835--848", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446105", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446105", abstract = "Emerging Nonvolatile memory (NVM) technologies like 3DX-point promise significant performance potential for OLTP databases. However, transactional databases need to be redesigned because the key assumptions that non-volatile storage is orders of magnitude slower than DRAM and only supports blocked-oriented access have changed. NVMs are byte-addressable and almost as fast as DRAM. The capacity of NVM is much (4-16x) larger than DRAM. Such NVM characteristics make it possible to build OLTP database entirely in NVM main memory.\par This paper studies the structure of OLTP engines with hybrid NVM and DRAM memory. We observe three challenges to design an OLTP engine for NVM: tuple metadata modifications, NVM write redundancy, and NVM space management. We propose Zen, a high-throughput log-free OLTP engine for NVM. Zen addresses the three design challenges with three novel techniques: metadata enhanced tuple cache, log-free persistent transactions, and light-weight NVM space management. 
Experimental results on a real machine equipped with Intel Optane DC Persistent Memory show that Zen achieves up to 10.1x improvement compared with existing solutions to run an OLTP database as large as the size of NVM while achieving fast failure recovery.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ji:2021:DPB, author = "Tianxi Ji and Pan Li and Emre Yilmaz and Erman Ayday and Yanfang (Fanny) Ye and Jinyuan Sun", title = "Differentially private binary- and matrix-valued data query: an {XOR} mechanism", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "5", pages = "849--862", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3446095.3446106", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 24 11:29:44 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3446095.3446106", abstract = "Differential privacy has been widely adopted to release continuous- and scalar-valued information on a database without compromising the privacy of individual data records in it. The problem of querying binary- and matrix-valued information on a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nakandala:2021:ECD, author = "Supun Nakandala and Yuhao Zhang and Arun Kumar", title = "Errata for {``Cerebro: a data system for optimized deep learning model selection''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "863--863", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447691", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{Nakandala:2020:CDS}.", URL = "https://dl.acm.org/doi/10.14778/3447689.3447691", abstract = "We discovered that there was an inconsistency in the communication cost formulation for the decentralized fine-grained training method in Table 2 of our paper [1]. We used Horovod as the archetype for decentralized fine-grained approaches, and its \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yin:2021:PBD, author = "Lujia Yin and Yiming Zhang and Zhaoning Zhang and Yuxing Peng and Peng Zhao", title = "{ParaX}: boosting deep learning for big data analytics on many-core {CPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "864--877", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447692", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447692", abstract = "Despite the fact that GPUs and accelerators are more efficient in deep learning (DL), commercial clouds like Facebook and Amazon now heavily use CPUs in DL computation because there are large numbers of CPUs which would otherwise sit idle during off-peak
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2021:OTF, author = "Walter Cai and Philip A. Bernstein and Wentao Wu and Badrish Chandramouli", title = "Optimization of threshold functions over streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "878--889", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447693", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447693", abstract = "A common stream processing application is alerting, where the data stream management system (DSMS) continuously evaluates a threshold function over incoming streams. If the threshold is crossed, the DSMS raises an alarm. The threshold function is often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2021:BCI, author = "Xuliang Zhu and Xin Huang and Byron Choi and Jiaxin Jiang and Zhaonian Zou and Jianliang Xu", title = "Budget constrained interactive search for multiple targets", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "890--902", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447694", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447694", abstract = "Interactive graph search leverages human intelligence to categorize target labels in a hierarchy, which is useful for image classification, product categorization, and database search. 
However, many existing interactive graph search studies aim at \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:SMK, author = "Yangjun Chen and Hoang Hai Nguyen", title = "On the string matching with $k$ differences in {DNA} databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "903--915", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447695", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447695", abstract = "In this paper, we discuss an efficient and effective index mechanism for the string matching with k differences, by which we will find all the substrings of a target string y of length n that align with a pattern string x of length m with not more than \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fujiwara:2021:FAA, author = "Yasuhiro Fujiwara and Sekitoshi Kanai and Yasutoshi Ida and Atsutoshi Kumagai and Naonori Ueda", title = "Fast algorithm for anchor graph hashing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "916--928", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447696", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447696", abstract = "Anchor graph hashing is used in many applications such as cancer detection, web page classification, and drug discovery. 
It computes the hash codes from the eigenvectors of the matrix representing the similarities between data points and anchor points; \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:ACG, author = "Wangda Zhang and Junyoung Kim and Kenneth A. Ross and Eric Sedlar and Lukas Stadler", title = "Adaptive code generation for data-intensive analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "929--942", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447697", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447697", abstract = "Modern database management systems employ sophisticated query optimization techniques that enable the generation of efficient plans for queries over very large data sets. A variety of other applications also process large data sets, but cannot leverage \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tsamoura:2021:MKB, author = "Efthymia Tsamoura and David Carral and Enrico Malizia and Jacopo Urbani", title = "Materializing knowledge bases via trigger graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "943--956", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447699", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447699", abstract = "The chase is a well-established family of algorithms used to materialize Knowledge Bases (KBs) for tasks like query answering under dependencies or data cleaning. A general problem of chase algorithms is that they might perform redundant computations. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:DEE, author = "Jinfei Liu and Jian Lou and Junxu Liu and Li Xiong and Jian Pei and Jimeng Sun", title = "{Dealer}: an end-to-end model marketplace with differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "957--969", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447700", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447700", abstract = "Data-driven machine learning has become ubiquitous. A marketplace for machine learning models connects data owners and model buyers, and can dramatically facilitate data-driven machine learning applications. In this paper, we take a formal data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rahman:2021:NIS, author = "Sajjadur Rahman and Mangesh Bendre and Yuyang Liu and Shichu Zhu and Zhaoyuan Su and Karrie Karahalios and Aditya G. Parameswaran", title = "{NOAH}: interactive spreadsheet exploration with dynamic hierarchical overviews", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "970--983", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447701", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447701", abstract = "Spreadsheet systems are by far the most popular platform for data exploration on the planet, supporting millions of rows of data. However, exploring spreadsheets that are this large via operations such as scrolling or issuing formulae can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2021:EBT, author = "Yixing Yang and Yixiang Fang and Maria E. Orlowska and Wenjie Zhang and Xuemin Lin", title = "Efficient bi-triangle counting for large bipartite networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "984--996", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447702", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447702", abstract = "A bipartite network is a network with two disjoint vertex sets and its edges only exist between vertices from different sets. 
It has received much interest since it can be used to model the relationship between two different sets of objects in many \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tata:2021:GSE, author = "Sandeep Tata and Navneet Potti and James B. Wendt and Lauro Beltr{\~a}o Costa and Marc Najork and Beliz Gunel", title = "{Glean}: structured extractions from templatic documents", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "997--1005", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447703", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447703", abstract = "Extracting structured information from templatic documents is an important problem with the potential to automate many real-world business workflows such as payment, procurement, and payroll. The core challenge is that such documents can be laid out in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2021:IGL, author = "Jun Gao and Jiazun Chen and Zhao Li and Ji Zhang", title = "{ICS-GNN}: lightweight interactive community search via graph neural network", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1006--1018", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447704", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447704", abstract = "Searching a community containing a given query vertex in an online social network enjoys wide applications like recommendation, team organization, etc. When applied to real-life networks, the existing approaches face two major limitations. First, they \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2021:BEN, author = "Yuanyuan Sun and Sheng Wang and Huorong Li and Feifei Li", title = "Building enclave-native storage engines for practical encrypted databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1019--1032", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447705", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447705", abstract = "Data confidentiality is one of the biggest concerns that hinders enterprise customers from moving their workloads to the cloud. 
Thanks to the trusted execution environment (TEE), it is now feasible to build encrypted databases in the enclave that can \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Thorne:2021:NLP, author = "James Thorne and Majid Yazdani and Marzieh Saeidi and Fabrizio Silvestri and Sebastian Riedel and Alon Halevy", title = "From natural language processing to neural databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1033--1039", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447706", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447706", abstract = "In recent years, neural networks have shown impressive performance gains on long-standing AI problems, such as answering queries from text and machine translation. These advances raise the question of whether neural nets can be used at the core of query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:RER, author = "Haibo Wang and Chaoyi Ma and Olufemi O. 
Odegbile and Shigang Chen and Jih-Kwon Peir", title = "Randomized error removal for online spread estimation in data streaming", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1040--1052", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447707", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447707", abstract = "Measuring flow spread in real time from large, high-rate data streams has numerous practical applications, where a data stream is modeled as a sequence of data items from different flows and the spread of a flow is the number of distinct items in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{DeLeo:2021:TAS, author = "Dean {De Leo} and Peter Boncz", title = "{Teseo} and the analysis of structural dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1053--1066", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447708", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See errata \cite{Leo:2021:ETA}.", URL = "https://dl.acm.org/doi/10.14778/3447689.3447708", abstract = "We present Teseo, a new system for the storage and analysis of dynamic structural graphs in main-memory and the addition of transactional support. Teseo introduces a novel design based on sparse arrays, large arrays interleaved with gaps, and a fat tree, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gubner:2021:CDS, author = "Tim Gubner and Peter Boncz", title = "Charting the design space of query execution using {VOILA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1067--1079", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447709", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447709", abstract = "Database architecture, while having been studied for four decades now, has delivered only a few designs with well-understood properties. These few are followed by most actual systems. Acquiring more knowledge about the design space is a very time- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:HES, author = "Zhiqi Wang and Jin Xue and Zili Shao", title = "{Heracles}: an efficient storage model and data flushing for performance monitoring timeseries", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1080--1092", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447710", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447710", abstract = "Performance-monitoring timeseries systems such as Prometheus and InfluxDB play a critical role in assuring reliability and operationally. These systems commonly adopt a column-oriented storage model, by which timeseries samples from different time- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Macke:2021:FGL, author = "Stephen Macke and Hongpu Gong and Doris Jung-Lin Lee and Andrew Head and Doris Xin and Aditya Parameswaran", title = "Fine-grained lineage for safer notebook interactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1093--1101", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447712", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447712", abstract = "Computational notebooks have emerged as the platform of choice for data science and analytical workflows, enabling rapid iteration and exploration. By keeping intermediate program state in memory and segmenting units of execution into so-called ``cells'', \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tsitsulin:2021:FAG, author = "Anton Tsitsulin and Marina Munkhoeva and Davide Mottin and Panagiotis Karras and Ivan Oseledets and Emmanuel M{\"u}ller", title = "{FREDE}: anytime graph embeddings", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1102--1110", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447713", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447713", abstract = "Low-dimensional representations, or embeddings, of a graph's nodes facilitate several practical data science and data engineering tasks. 
As such embeddings rely, explicitly or implicitly, on a similarity measure among nodes, they require the computation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:AGM, author = "Xiaodong Li and Reynold Cheng and Kevin Chen-Chuan Chang and Caihua Shan and Chenhao Ma and Hongtai Cao", title = "On analyzing graphs with motif-paths", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "6", pages = "1111--1123", month = feb, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3447689.3447714", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:38 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3447689.3447714", abstract = "Path-based solutions have been shown to be useful for various graph analysis tasks, such as link prediction and graph clustering. However, they are no longer adequate for handling complex and gigantic graphs. Recently, motif-based analysis has attracted \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tsaras:2021:CIM, author = "Dimitris Tsaras and George Trimponias and Lefteris Ntaflos and Dimitris Papadias", title = "Collective influence maximization for multiple competing products with an awareness-to-influence model", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1124--1136", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450981", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450981", abstract = "Influence maximization (IM) is a fundamental task in social network analysis. 
Typically, IM aims at selecting a set of seeds for the network that influences the maximum number of individuals. Motivated by practical applications, in this paper we focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2021:FGS, author = "Yahui Sun and Xiaokui Xiao and Bin Cui and Saman Halgamuge and Theodoros Lappas and Jun Luo", title = "Finding group {Steiner} trees in graphs with both vertex and edge weights", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1137--1149", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450982", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450982", abstract = "Given an undirected graph and a number of vertex groups, the group Steiner trees problem is to find a tree such that (i) this tree contains at least one vertex in each vertex group; and (ii) the sum of vertex and edge weights in this tree is minimized. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abeywickrama:2021:OBM, author = "Tenindra Abeywickrama and Victor Liang and Kian-Lee Tan", title = "Optimizing bipartite matching in real-world applications by incremental cost computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1150--1158", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450983", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450983", abstract = "The Kuhn-Munkres (KM) algorithm is a classical combinatorial optimization algorithm that is widely used for minimum cost bipartite matching in many real-world applications, such as transportation. For example, a ride-hailing service may use it to find \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trummer:2021:CNE, author = "Immanuel Trummer", title = "The case for {NLP}-enhanced database tuning: towards tuning tools that ``read the manual''", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1159--1165", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450984", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450984", abstract = "A large body of knowledge on database tuning is available in the form of natural language text. We propose to leverage natural language processing (NLP) to make that knowledge accessible to automated tuning tools. We describe multiple avenues to exploit \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maiyya:2021:EUC, author = "Sujaya Maiyya and Faisal Nawab and Divyakant Agrawal and Amr {El Abbadi}", title = "Errata for {``Unifying consensus and atomic commitment for effective cloud data management''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1166--1166", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450985", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{Maiyya:2019:UCA}.", URL = "https://dl.acm.org/doi/10.14778/3450980.3450985", abstract = "This errata article discusses and corrects a minor error in our work published in VLDB 2019. The discrepancy specifically pertains to Algorithms 3 and 4. The algorithms presented in the paper are biased towards a commit decision in a specific failure \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Istvan:2021:SDD, author = "Zsolt Istv{\'a}n and Soujanya Ponnapalli and Vijay Chidambaram", title = "Software-defined data protection: low overhead policy compliance at the storage layer is within reach!", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1167--1174", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450986", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450986", abstract = "Most modern data processing pipelines run on top of a distributed storage layer, and securing the whole system, and the storage layer in particular, against accidental or malicious misuse is crucial to ensuring compliance to rules and regulations. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:TRT, author = "Tianyi Li and Lu Chen and Christian S. Jensen and Torben Bach Pedersen", title = "{TRACE}: real-time compression of streaming trajectories in road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1175--1187", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450987", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450987", abstract = "The deployment of vehicle location services generates increasingly massive vehicle trajectory data, which incurs high storage and transmission costs. 
A range of studies target offline compression to reduce the storage cost. However, to enable online \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Saha:2021:SPC, author = "Arkaprava Saha and Ruben Brokkelkamp and Yllka Velaj and Arijit Khan and Francesco Bonchi", title = "Shortest paths and centrality in uncertain networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1188--1201", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450988", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450988", abstract = "Computing the shortest path between a pair of nodes is a fundamental graph primitive, which has critical applications in vehicle routing, finding functional pathways in biological networks, survivable network design, among many others. In this work, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:ADAa, author = "Tongyu Liu and Ju Fan and Yinqing Luo and Nan Tang and Guoliang Li and Xiaoyong Du", title = "Adaptive data augmentation for supervised learning over missing data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1202--1214", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450989", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450989", abstract = "Real-world data is dirty, which causes serious problems in (supervised) machine learning (ML). 
The widely used practice in such scenario is to first repair the labeled source (a.k.a. train) data using rule-, statistical- or ML-based methods and then use \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2021:KPA, author = "Fuheng Zhao and Sujaya Maiyya and Ryan Wiener and Divyakant Agrawal and Amr {El Abbadi}", title = "{KLL$^\pm $} approximate quantile sketches over dynamic datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1215--1227", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450990", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450990", abstract = "Recently the long standing problem of optimal construction of quantile sketches was resolved by Karnin, Lang, and Liberty using the KLL sketch (FOCS 2016). The algorithm for KLL is restricted to online insert operations and no delete operations. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jankov:2021:DNM, author = "Dimitrije Jankov and Binhang Yuan and Shangyu Luo and Chris Jermaine", title = "Distributed numerical and machine learning computations via two-phase execution of aggregated join trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1228--1240", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450991", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450991", abstract = "When numerical and machine learning (ML) computations are expressed relationally, classical query execution strategies (hash-based joins and aggregations) can do a poor job distributing the computation. In this paper, we propose a two-phase execution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{VanAken:2021:IML, author = "Dana {Van Aken} and Dongsheng Yang and Sebastien Brillard and Ari Fiorino and Bohan Zhang and Christian Bilien and Andrew Pavlo", title = "An inquiry into machine learning-based automatic configuration tuning services on real-world database management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "7", pages = "1241--1253", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3450980.3450992", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Apr 13 13:43:39 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3450980.3450992", abstract = "Modern database management systems (DBMS) expose dozens of configurable knobs that control their runtime behavior. 
Setting these knobs correctly for an application's workload can improve the performance and efficiency of the DBMS. But because of their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2021:RRP, author = "Nan Tang and Ju Fan and Fangyi Li and Jianhong Tu and Xiaoyong Du and Guoliang Li and Sam Madden and Mourad Ouzzani", title = "{RPT}: relational pre-trained transformer is almost all you need towards democratizing data preparation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1254--1261", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457391", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457391", abstract = "Can AI help automate human-easy but computer-hard data preparation tasks that burden data scientists, practitioners, and crowd workers? We answer this question by presenting RPT, a denoising autoencoder for tuple-to-X models (``X'' could be tuple, token, \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zou:2021:LAP, author = "Jia Zou and Amitabh Das and Pratik Barhate and Arun Iyengar and Binhang Yuan and Dimitrije Jankov and Chris Jermaine", title = "{Lachesis}: automatic partitioning for {UDF}-centric analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1262--1275", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457392", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457392", abstract = "Partitioning is effective in avoiding expensive shuffling operations. However, it remains a significant challenge to automate this process for Big Data analytics workloads that extensively use user defined functions (UDFs), where sub-computations are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2021:ULI, author = "Jiacheng Wu and Yong Zhang and Shimin Chen and Jin Wang and Yu Chen and Chunxiao Xing", title = "Updatable learned index with precise positions", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1276--1288", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457393", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457393", abstract = "Index plays an essential role in modern database engines to accelerate the query processing. The new paradigm of ``learned index'' has significantly changed the way of designing index structures in DBMS. 
The key insight is that indexes could be regarded \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2021:MMS, author = "Ziquan Fang and Lu Pan and Lu Chen and Yuntao Du and Yunjun Gao", title = "{MDTP}: a multi-source deep traffic prediction framework over spatio-temporal trajectory data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1289--1297", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457394", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457394", abstract = "Traffic prediction has drawn increasing attention for its ubiquitous real-life applications in traffic management, urban computing, public safety, and so on. Recently, the availability of massive trajectory data and the success of deep learning motivate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Min:2021:SCS, author = "Seunghwan Min and Sung Gwan Park and Kunsoo Park and Dora Giammarresi and Giuseppe F. 
Italiano and Wook-Shin Han", title = "Symmetric continuous subgraph matching with bidirectional dynamic programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1298--1310", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457395", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457395", abstract = "In many real datasets such as social media streams and cyber data sources, graphs change over time through a graph update stream of edge insertions and deletions. Detecting critical patterns in such dynamic graphs plays an important role in various \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Suzuki:2021:ADP, author = "Tomoya Suzuki and Kazuhiro Hiwada and Hirotsugu Kajihara and Shintaro Sano and Shuou Nomura and Tatsuo Shiozawa", title = "Approaching {DRAM} performance by using microsecond-latency flash memory for small-sized random read accesses: a new access method and its graph applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1311--1324", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457397", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457397", abstract = "For applications in which small-sized random accesses frequently occur for datasets that exceed DRAM capacity, placing the datasets on SSD can result in poor application performance. For the read-intensive case we focus on in this paper, low latency \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Orogat:2021:CTB, author = "Abdelghny Orogat and Isabelle Liu and Ahmed El-Roby", title = "{CBench}: towards better evaluation of question answering over knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1325--1337", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457398", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457398", abstract = "Recently, there has been an increase in the number of knowledge graphs that can be only queried by experts. However, describing questions using structured queries is not straightforward for non-expert users who need to have sufficient knowledge about \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2021:TRA, author = "Binhang Yuan and Dimitrije Jankov and Jia Zou and Yuxin Tang and Daniel Bourgeois and Chris Jermaine", title = "Tensor relational algebra for distributed machine learning system design", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1338--1350", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457399", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457399", abstract = "We consider the question: what is the abstraction that should be implemented by the computational engine of a machine learning system? 
Current machine learning systems typically push whole tensors through a series of compute kernels such as matrix \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2021:PDD, author = "Wenfei Fan and Chao Tian and Yanghao Wang and Qiang Yin", title = "Parallel discrepancy detection and incremental detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1351--1364", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457400", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457400", abstract = "This paper studies how to catch duplicates, mismatches and conflicts in the same process. We adopt a class of entity enhancing rules that embed machine learning predicates, unify entity resolution and conflict resolution, and are collectively defined \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:TCA, author = "Tiantian Liu and Huan Li and Hua Lu and Muhammad Aamir Cheema and Lidan Shou", title = "Towards crowd-aware indoor path planning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1365--1377", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457401", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457401", abstract = "Indoor venues accommodate many people who collectively form crowds. 
Such crowds in turn influence people's routing choices, e.g., people may prefer to avoid crowded rooms when walking from A to B. This paper studies two types of crowd-aware indoor path \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gupta:2021:PES, author = "Surabhi Gupta and Karthik Ramachandra", title = "Procedural extensions of {SQL}: understanding their usage in the wild", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1378--1391", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457402", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457402", abstract = "Procedural extensions of SQL have been in existence for many decades now. However, little is known about their magnitude of usage and their complexity in real-world workloads. Procedural code executing in a RDBMS is known to have inefficiencies and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bharadwaj:2021:DRD, author = "Sagar Bharadwaj and Praveen Gupta and Ranjita Bhagwan and Saikat Guha", title = "Discovering related data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1392--1400", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457403", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457403", abstract = "Analysts frequently require data from multiple sources for their tasks, but finding these sources is challenging in exabyte-scale data lakes. In this paper, we address this problem for our enterprise's data lake by using machine-learning to identify \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cereda:2021:CCG, author = "Stefano Cereda and Stefano Valladares and Paolo Cremonesi and Stefano Doni", title = "{CGPTuner}: a contextual {Gaussian} process bandit approach for the automatic tuning of {IT} configurations under varying workload conditions", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1401--1413", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457404", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457404", abstract = "Properly selecting the configuration of a database management system (DBMS) is essential to increase performance and reduce costs. However, the task is astonishingly tricky due to a large number of tunable configuration parameters and their inter-dependencies
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schiavio:2021:LAI, author = "Filippo Schiavio and Daniele Bonetta and Walter Binder", title = "Language-agnostic integrated queries in a managed polyglot runtime", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1414--1426", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457405", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457405", abstract = "Language-integrated query (LINQ) frameworks offer a convenient programming abstraction for processing in-memory collections of data, allowing developers to concisely express declarative queries using general-purpose programming languages. Existing LINQ \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kulkarni:2021:AHT, author = "Chinmay Kulkarni and Badrish Chandramouli and Ryan Stutsman", title = "Achieving high throughput and elasticity in a larger-than-memory store", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1427--1440", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457406", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457406", abstract = "Millions of sensors, mobile applications and machines now generate billions of events.
Specialized many-core key-value stores (KVSs) can ingest and index these events at high rates (over 100 Mops/s on one machine) if events are generated on the same \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yao:2021:ESB, author = "Kai Yao and Lijun Chang", title = "Efficient size-bounded community search over large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "8", pages = "1441--1453", month = apr, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3457390.3457407", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:31 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3457390.3457407", abstract = "The problem of community search, which aims to find a cohesive subgraph containing user-given query vertices, has been extensively studied recently. Most of the existing studies mainly focus on the cohesiveness of the returned community, while ignoring \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2021:MVA, author = "Jianwen Zhao and Yufei Tao", title = "Minimum vertex augmentation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1454--1466", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461536", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461536", abstract = "This paper introduces a class of graph problems named minimum vertex augmentation (MVA). 
Given an input graph G where each vertex carries a binary color 0 or 1, we want to flip the colors of the fewest 0-vertices such that the subgraph induced by all \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gaffney:2021:DIS, author = "Kevin P. Gaffney and Robert Claus and Jignesh M. Patel", title = "Database isolation by scheduling", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1467--1480", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461537", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461537", abstract = "Transaction isolation is conventionally achieved by restricting access to the physical items in a database. To maximize performance, isolation functionality is often packaged with recovery, I/O, and data access methods in a monolithic transactional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Park:2021:SSS, author = "Jong-Hyeok Park and Soyee Choi and Gihwan Oh and Sang-Won Lee", title = "{SaS}: {SSD} as {SQL} database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1481--1488", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461538", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461538", abstract = "Every database engine runs on top of an operating system in the host, strictly separated with the storage. 
This more-than-half-century-old IHDE (In-Host-Database-Engine) architecture, however, reveals its limitations when run on fast flash memory SSDs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2021:FFL, author = "Rong Zhu and Ziniu Wu and Yuxing Han and Kai Zeng and Andreas Pfadler and Zhengping Qian and Jingren Zhou and Bin Cui", title = "{FLAT}: fast, lightweight and accurate method for cardinality estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1489--1502", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461539", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461539", abstract = "Query optimizers rely on accurate cardinality estimation (CardEst) to produce good execution plans. The core problem of CardEst is how to model the rich joint distribution of attributes in an accurate and compact manner. Despite decades of research, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2021:FAA, author = "Tsz Nam Chan and Zhe Li and Leong Hou U
and Jianliang Xu and Reynold Cheng", title = "Fast augmentation algorithms for network kernel density visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1503--1516", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461540", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461540", abstract = "Network kernel density visualization, or NKDV, has been extensively used to visualize spatial data points in various domains, including traffic accident hotspot detection, crime hotspot detection, disease outbreak detection, and business and urban \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:AAG, author = "Jiawei Wang and Cheng Li and Kai Ma and Jingze Huo and Feng Yan and Xinyu Feng and Yinlong Xu", title = "{AUTOGR}: automated geo-replication with fast system performance and preserved application semantics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1517--1530", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461541", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461541", abstract = "Geo-replication is essential for providing low latency response and quality Internet services. However, designing fast and correct geo-replicated services is challenging due to the complex trade-off between performance and consistency semantics in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:LAD, author = "Qing Liu and Xuliang Zhu and Xin Huang and Jianliang Xu", title = "Local algorithms for distance-generalized core decomposition over large dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1531--1543", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461542", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461542", abstract = "The distance-generalized core, also called $(k, h)$-core, is defined as the maximal subgraph in which every vertex has at least $k$ vertices at distance no longer than $h$. Compared with $k$-core, $(k, h)$-core can identify more fine-grained subgraphs and, hence, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Benson:2021:VEH, author = "Lawrence Benson and Hendrik Makait and Tilmann Rabl", title = "{Viper}: an efficient hybrid {PMem-DRAM} key-value store", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1544--1556", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461543", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461543", abstract = "Key-value stores (KVSs) have found wide application in modern software systems. For persistence, their data resides in slow secondary storage, which requires KVSs to employ various techniques to increase their read and write performance from and to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeighami:2021:ESC, author = "Sepanta Zeighami and Cyrus Shahabi and John Krumm", title = "Estimating spread of contact-based contagions in a population through sub-sampling", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1557--1569", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461544", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461544", abstract = "Various phenomena such as viruses, gossips, and physical objects (e.g., packages and marketing pamphlets) can be spread through physical contacts. The spread depends on how people move, i.e., their mobility patterns. In practice, mobility patterns of an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Herodotou:2021:TTS, author = "Herodotos Herodotou and Elena Kakoulli", title = "{Trident}: task scheduling over tiered storage systems in big data platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1570--1582", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461545", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461545", abstract = "The recent advancements in storage technologies have popularized the use of tiered storage systems in data-intensive compute clusters. The Hadoop Distributed File System (HDFS), for example, now supports storing data in memory, SSDs, and HDDs, while \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cong:2021:CCE, author = "Zicun Cong and Lingyang Chu and Yu Yang and Jian Pei", title = "Comprehensible counterfactual explanation on {Kolmogorov--Smirnov} test", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1583--1596", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461546", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461546", abstract = "The Kolmogorov--Smirnov (KS) test is popularly used in many applications, such as anomaly detection, astronomy, database security and AI systems. One challenge remained untouched is how we can obtain an explanation on why a test set fails the KS test. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2021:ALS, author = "Hongkuan Zhou and Ajitesh Srivastava and Hanqing Zeng and Rajgopal Kannan and Viktor Prasanna", title = "Accelerating large scale real-time {GNN} inference using channel pruning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1597--1605", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461547", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461547", abstract = "Graph Neural Networks (GNNs) are proven to be powerful models to generate node embedding for downstream applications. However, due to the high computation complexity of GNN inference, it is hard to deploy GNNs for large-scale or real-time applications. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Leis:2021:TCO, author = "Viktor Leis and Maximilian Kuschewski", title = "Towards cost-optimal query processing in the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1606--1612", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461549", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461549", abstract = "Public cloud providers offer hundreds of heterogeneous hardware instances. For analytical query processing systems, this presents a major challenge: depending on the hardware configuration, performance and cost may differ by orders of magnitude. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2021:AIG, author = "Shufeng Gong and Chao Tian and Qiang Yin and Wenyuan Yu and Yanfeng Zhang and Liang Geng and Song Yu and Ge Yu and Jingren Zhou", title = "Automating incremental graph processing with flexible memoization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1613--1625", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461550", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461550", abstract = "The ever-growing amount of dynamic graph data demands efficient techniques of incremental graph processing. However, incremental graph algorithms are challenging to develop. 
Existing approaches usually require users to manually design nontrivial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jepsen:2021:NST, author = "Theo Jepsen and Alberto Lerner and Fernando Pedone and Robert Soul{\'e} and Philippe Cudr{\'e}-Mauroux", title = "In-network support for transaction triaging", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1626--1639", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461551", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461551", abstract = "We introduce Transaction Triaging, a set of techniques that manipulate streams of transaction requests and responses while they travel to and from a database server. Compared to normal transaction streams, the triaged ones execute faster once they reach \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:WRL, author = "Xiaoying Wang and Changbo Qu and Weiyuan Wu and Jiannan Wang and Qingqing Zhou", title = "Are we ready for learned cardinality estimation?", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1640--1654", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461552", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461552", abstract = "Cardinality estimation is a fundamental but long unresolved problem in query optimization. 
Recently, multiple papers from different research groups consistently report that learned models have the potential to replace existing cardinality estimators. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lemiesz:2021:ADS, author = "Jakub Lemiesz", title = "On the algebra of data sketches", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1655--1667", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461553", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461553", abstract = "We consider the problem of designing a distributed data sketch for scenario in which data stream is observed by many independent network nodes. We require that a sketch apart from being computationally and memory efficient should also be mergeable in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hou:2021:MPA, author = "Guanhao Hou and Xingguang Chen and Sibo Wang and Zhewei Wei", title = "Massively parallel algorithms for {Personalized PageRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1668--1680", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461554", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461554", abstract = "Personalized PageRank (PPR) has wide applications in search engines, social recommendations, community detection, and so on. 
Nowadays, graphs are becoming massive and many IT companies need to deal with large graphs that cannot be fitted into the memory of most commodity servers. However, most existing state-of-the-art solutions for PPR computation only work for single-machines and are inefficient for the distributed framework since such solutions either (i) result in an excessively large number of communication rounds, or (ii) incur high communication costs in each round. Motivated by this, we present Delta-Push, an efficient framework for single-source and top-$k$ PPR queries in distributed settings. Our goal is to reduce the number of rounds while guaranteeing that the load, i.e., the maximum number of messages an executor sends or receives in a round, can be bounded by the capacity of each executor. We first present a non-trivial combination of a redesigned parallel push algorithm and the Monte-Carlo method to answer single-source PPR queries. The solution uses pre-sampled random walks to reduce the number of rounds for the push algorithm. Theoretical analysis under the Massively Parallel Computing (MPC) model shows that our proposed solution bounds the communication rounds to [EQUATION] under a load of $ O(m / p) $, where $m$ is the number of edges of the input graph, $p$ is the number of executors, and $ \epsilon $ is a user-defined error parameter. In the meantime, as the number of executors increases to $ p' = \gamma \cdot p$, the load constraint can be relaxed since each executor can hold $ O(\gamma \cdot m / p')$ messages with invariant local memory. In such scenarios, multiple queries can be processed in batches simultaneously. We show that with a load of $ O(\gamma \cdot m / p')$, our Delta-Push can process $ \gamma $ queries in a batch with [EQUATION] rounds, while other baseline solutions still keep the same round cost for each batch. We further present a new top-$k$ algorithm that is friendly to the distributed framework and reduces the number of rounds required in practice. 
Extensive experiments show that our proposed solution is more efficient than alternatives.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schleich:2021:GQC, author = "Maximilian Schleich and Zixuan Geng and Yihong Zhang and Dan Suciu", title = "{GeCo}: quality counterfactual explanations in real time", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1681--1693", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3461555", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3461555", abstract = "Machine learning is increasingly applied in high-stakes decision making that directly affect people's lives, and this leads to an increased demand for systems to explain their decisions. Explanations often take the form of counterfactuals, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Salazar:2021:AFE, author = "Ricardo Salazar and Felix Neutatz and Ziawasch Abedjan", title = "Automated feature engineering for algorithmic fairness", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "9", pages = "1694--1702", month = may, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3461535.3463474", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 23 06:39:32 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3461535.3463474", abstract = "One of the fundamental problems of machine ethics is to avoid the perpetuation and amplification of discrimination through machine learning applications. 
In particular, it is desired to exclude the influence of attributes with sensitive information, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Addanki:2021:HDR, author = "Raghavendra Addanki and Sainyam Galhotra and Barna Saha", title = "How to design robust algorithms using noisy comparison {Oracle}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1703--1716", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467862", abstract = "Metric based comparison operations such as finding maximum, nearest and farthest neighbor are fundamental to studying various clustering techniques such as $k$-center clustering and agglomerative hierarchical clustering. These techniques crucially rely on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boniol:2021:SSS, author = "Paul Boniol and John Paparrizos and Themis Palpanas and Michael J. 
Franklin", title = "{SAND}: streaming subsequence anomaly detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1717--1729", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467863", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467863", abstract = "With the increasing demand for real-time analytics and decision making, anomaly detection methods need to operate over streams of values and handle drifts in data distribution. Unfortunately, existing approaches have severe limitations: they either \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2021:OFU, author = "Yingtai Xiao and Zeyu Ding and Yuxin Wang and Danfeng Zhang and Daniel Kifer", title = "Optimizing fitness-for-use of differentially private linear queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1730--1742", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467864", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467864", abstract = "In practice, differentially private data releases are designed to support a variety of applications. A data release is fit for use if it meets target accuracy requirements for each application. In this paper, we consider the problem of answering linear \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2021:CED, author = "Xinle Cao and Jian Liu and Hao Lu and Kui Ren", title = "Cryptanalysis of an encrypted database in {SIGMOD '14}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1743--1755", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467865", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467865", abstract = "Encrypted database is an innovative technology proposed to solve the data confidentiality issue in cloud-based DB systems. It allows a data owner to encrypt its database before uploading it to the service provider; and it allows the service provider to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jin:2021:USM, author = "Tianyuan Jin and Yu Yang and Renchi Yang and Jieming Shi and Keke Huang and Xiaokui Xiao", title = "Unconstrained submodular maximization with modular costs: tight approximation and application to profit maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1756--1768", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467866", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467866", abstract = "Given a set $V$, the problem of unconstrained submodular maximization with modular costs (USM-MC) asks for a subset $ S \subseteq V $ that maximizes $ f(S) - c(S) $, where $f$ is a non-negative, monotone, and submodular function that gauges the utility of $S$, and $c$ is a non-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:DDL, author = "Yuhao Zhang and Frank McQuillan and Nandish Jayaram and Nikhil Kak and Ekta Khanna and Orhan Kislal and Domino Valdano and Arun Kumar", title = "Distributed deep learning on data systems: a comparative analysis of approaches", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1769--1782", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467867", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467867", abstract = "Deep learning (DL) is growing in popularity for many data analytics applications, including among enterprises. 
Large business-critical datasets in such settings typically reside in RDBMSs or other data systems. The DB community has long aimed to bring \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sheng:2021:PSM, author = "Siyuan Sheng and Qun Huang and Sa Wang and Yungang Bao", title = "{PR}-sketch: monitoring per-key aggregation of streaming data with nearly full accuracy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1783--1796", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467868", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467868", abstract = "Computing per-key aggregation is indispensable in streaming data analysis formulated as two phases, an update phase and a recovery phase. As the size and speed of data streams rise, accurate per-key information is useful in many applications like \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koutsoukos:2021:TAG, author = "Dimitrios Koutsoukos and Supun Nakandala and Konstantinos Karanasos and Karla Saur and Gustavo Alonso and Matteo Interlandi", title = "{Tensors}: an abstraction for general data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1797--1804", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467869", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467869", abstract = "Deep Learning (DL) has created a growing demand for simpler ways to develop complex models and efficient ways to execute them. Thus, a significant effort has gone into frameworks like PyTorch or TensorFlow to support a variety of DL models and run \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pujol:2021:BSM, author = "David Pujol and Yikai Wu and Brandon Fain and Ashwin Machanavajjhala", title = "Budget sharing for multi-analyst differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1805--1817", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467870", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467870", abstract = "Large organizations that collect data about populations (like the US Census Bureau) release summary statistics that are used by multiple stakeholders for resource allocation and policy making problems. 
These organizations are also legally required to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Poepsel-Lemaitre:2021:LDS, author = "Rudi Poepsel-Lemaitre and Martin Kiefer and Joscha von Hein and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker Markl", title = "In the land of data streams where synopses are missing, one framework to bring them all", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1818--1831", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467871", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467871", abstract = "In pursuit of real-time data analysis, approximate summarization structures, i.e., synopses, have gained importance over the years. However, existing stream processing systems, such as Flink, Spark, and Storm, do not support synopses as first class \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:DAI, author = "Yifan Li and Xiaohui Yu and Nick Koudas", title = "Data acquisition for improving machine learning models", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1832--1844", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467872", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467872", abstract = "The vast advances in Machine Learning (ML) over the last ten years have been powered by the availability of suitably prepared data for training purposes. 
The future of ML-enabled enterprise hinges on data. As such, there is already a vibrant market \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:EAR, author = "Xiaoshuang Chen and Kai Wang and Xuemin Lin and Wenjie Zhang and Lu Qin and Ying Zhang", title = "Efficiently answering reachability and path queries on temporal bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1845--1858", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467873", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467873", abstract = "Bipartite graphs are naturally used to model relationships between two different types of entities, such as people-location, author-paper, and customer-product. When modeling real-world applications like disease outbreaks, edges are often enriched with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ciaccia:2021:PQT, author = "Paolo Ciaccia and Davide Martinenghi and Riccardo Torlone", title = "Preference queries over taxonomic domains", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1859--1871", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467874", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467874", abstract = "When composing multiple preferences characterizing the most suitable results for a user, several issues may arise. 
Indeed, preferences can be partially contradictory, suffer from a mismatch with the level of detail of the actual data, and even lack \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2021:RDL, author = "Baoyue Yan and Xuntao Cheng and Bo Jiang and Shibin Chen and Canfang Shang and Jianying Wang and Gui Huang and Xinjun Yang and Wei Cao and Feifei Li", title = "Revisiting the design of {LSM}-tree Based {OLTP} storage engine with persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1872--1885", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467875", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467875", abstract = "The recent byte-addressable and large-capacity commercialized persistent memory (PM) is promising to drive database as a service (DBaaS) into unchartered territories. This paper investigates how to leverage PMs to revisit the conventional LSM-tree based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2021:KCA, author = "Chang Ge and Shubhankar Mohapatra and Xi He and Ihab F. 
Ilyas", title = "{Kamino}: constraint-aware differentially private data synthesis", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1886--1899", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467876", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467876", abstract = "Organizations are increasingly relying on data to support decisions. When data contains private and sensitive information, the data owner often desires to publish a synthetic database instance that is similarly useful as the true data, while ensuring \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:TCE, author = "Yingqiang Zhang and Chaoyi Ruan and Cheng Li and Xinjun Yang and Wei Cao and Feifei Li and Bo Wang and Jing Fang and Yuhui Wang and Jingze Huo and Chao Bi", title = "Towards cost-effective and elastic cloud database deployment via memory disaggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1900--1912", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467877", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467877", abstract = "It is challenging for cloud-native relational databases to meet the ever-increasing needs of scaling compute and memory resources independently and elastically. The recent emergence of memory disaggregation architecture, relying on high-speed RDMA \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Peeters:2021:DOF, author = "Ralph Peeters and Christian Bizer", title = "Dual-objective fine-tuning of {BERT} for entity matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "10", pages = "1913--1921", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3467861.3467878", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Oct 27 15:40:22 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3467861.3467878", abstract = "An increasing number of data providers have adopted shared numbering schemes such as GTIN, ISBN, DUNS, or ORCID numbers for identifying entities in the respective domain. This means for data integration that shared identifiers are often available for a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Besta:2021:GEH, author = "Maciej Besta and Zur Vonarburg-Shmaria and Yannick Schaffner and Leonardo Schwarz and Grzegorz Kwasniewski and Lukas Gianinazzi and Jakub Beranek and Kacper Janda and Tobias Holenstein and Sebastian Leisinger and Peter Tatkowski and Esref Ozdemir and Adrian Balla and Marcin Copik and Philipp Lindenberger and Marek Konieczny and Onur Mutlu and Torsten Hoefler", title = "{GraphMineSuite}: enabling high-performance and programmable graph mining algorithms with set algebra", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1922--1935", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476252", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476252", abstract = "We propose 
GraphMineSuite (GMS): the first benchmarking suite for graph mining that facilitates evaluating and constructing high-performance graph mining algorithms. First, GMS comes with a benchmark specification based on extensive literature review, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Takenouchi:2021:PES, author = "Keita Takenouchi and Takashi Ishio and Joji Okada and Yuji Sakata", title = "{PATSQL}: efficient synthesis of {SQL} queries from example tables with quick inference of projected columns", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1937--1949", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476253", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476253", abstract = "SQL is one of the most popular tools for data analysis, and it is now used by an increasing number of users without having expertise in databases. Several studies have proposed programming-by-example approaches to help such non-experts to write correct \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:FFA, author = "Jie Liu and Wenqian Dong and Qingqing Zhou and Dong Li", title = "{Fauce}: fast and accurate deep ensembles with uncertainty for cardinality estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1950--1963", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476254", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476254", abstract = "Cardinality estimation is a fundamental and critical problem in databases. Recently, many estimators based on deep learning have been proposed to solve this problem and they have achieved promising results. However, these estimators struggle to provide \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:CSE, author = "Mengzhao Wang and Xiaoliang Xu and Qiang Yue and Yuxiang Wang", title = "A comprehensive survey and experimental comparison of graph-based approximate nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1964--1978", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476255", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476255", abstract = "Approximate nearest neighbor search (ANNS) constitutes an important operation in a multitude of applications, including recommendation systems, information retrieval, and pattern recognition. 
In the past decade, graph-based ANNS algorithms have been the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2021:TPP, author = "Zifeng Yuan and Huey Eng Chua and Sourav S. Bhowmick and Zekun Ye and Wook-Shin Han and Byron Choi", title = "Towards plug-and-play visual graph query interfaces: data-driven selection of canned patterns for large networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1979--1991", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476256", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476256", abstract = "Canned patterns (i.e., small subgraph patterns) in visual graph query interfaces (a.k.a GUI) facilitate efficient query formulation by enabling pattern-at-a-time construction mode. However, existing GUIs for querying large networks either do not expose \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2021:TMG, author = "Shixuan Sun and Yuhang Chen and Shengliang Lu and Bingsheng He and Yuchen Li", title = "{ThunderRW}: an in-memory graph random walk engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "1992--2005", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476257", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476257", abstract = "As random walk is a powerful tool in many graph processing, mining and learning applications, this paper proposes an efficient in-memory random walk engine named ThunderRW. Compared with existing parallel systems on improving the performance of a single \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dong:2021:BCC, author = "Zheng Dong and Xin Huang and Guorui Yuan and Hengshu Zhu and Hui Xiong", title = "Butterfly-core community search over labeled graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2006--2018", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476258", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476258", abstract = "Community search aims at finding densely connected subgraphs for query vertices in a graph. While this task has been studied widely in the literature, most of the existing works only focus on finding homogeneous communities rather than heterogeneous \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Negi:2021:FLL, author = "Parimarjan Negi and Ryan Marcus and Andreas Kipf and Hongzi Mao and Nesime Tatbul and Tim Kraska and Mohammad Alizadeh", title = "{Flow-loss}: learning cardinality estimates that matter", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2019--2032", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476259", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476259", abstract = "Recently there has been significant interest in using machine learning to improve the accuracy of cardinality estimation. This work has focused on improving average estimation error, but not all estimates matter equally for downstream tasks like query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2021:QHK, author = "Michael Yu and Dong Wen and Lu Qin and Ying Zhang and Wenjie Zhang and Xuemin Lin", title = "On querying historical $k$-cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2033--2045", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476260", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476260", abstract = "Many real-world relationships between entities can be modeled as temporal graphs, where each edge is associated with a timestamp or a time interval representing its occurrence. 
$K$-core is a fundamental model used to capture cohesive subgraphs in a simple \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cormode:2021:FEU, author = "Graham Cormode and Samuel Maddock and Carsten Maple", title = "Frequency estimation under local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2046--2058", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476261", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476261", abstract = "Private collection of statistics from a large distributed population is an important problem, and has led to large scale deployments from several leading technology companies. The dominant approach requires each user to randomly perturb their input, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zogaj:2021:DML, author = "Fatjon Zogaj and Jos{\'e} Pablo Cambronero and Martin C. Rinard and J{\"u}rgen Cito", title = "Doing more with less: characterizing dataset downsampling for {AutoML}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2059--2072", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476262", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476262", abstract = "Automated machine learning (AutoML) promises to democratize machine learning by automatically generating machine learning pipelines with little to no user intervention.
Typically, a search procedure is used to repeatedly generate and validate candidate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:LBE, author = "Yifan Li and Xiaohui Yu and Nick Koudas", title = "{LES$^3$}: learning-based exact set similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2073--2086", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476263", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476263", abstract = "Set similarity search is a problem of central interest to a wide variety of applications such as data cleaning and web search. Past approaches on set similarity search utilize either heavy indexing structures, incurring large search costs or indexes \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Min:2021:LGC, author = "Seung Won Min and Kun Wu and Sitao Huang and Mert Hidayetoglu and Jinjun Xiong and Eiman Ebrahimi and Deming Chen and Wen-mei Hwu", title = "Large graph convolutional network training with {GPU}-oriented data communication architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2087--2100", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476264", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476264", abstract = "Graph Convolutional Networks (GCNs) are increasingly adopted in large-scale graph-based recommender systems.
Training GCN requires the minibatch generator traversing graphs and sampling the sparsely located neighboring nodes to obtain their features. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2021:FHP, author = "Yifei Yang and Matt Youill and Matthew Woicik and Yizhou Liu and Xiangyao Yu and Marco Serafini and Ashraf Aboulnaga and Michael Stonebraker", title = "{FlexPushdownDB}: hybrid pushdown and caching in a cloud {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2101--2113", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476265", abstract = "Modern cloud databases adopt a storage-disaggregation architecture that separates the management of computation and storage. A major bottleneck in such an architecture is the network connecting the computation and storage layers. Two solutions have been \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:AMA, author = "Zhiwei Chen and Shaoxu Song and Ziheng Wei and Jingyun Fang and Jiang Long", title = "Approximating median absolute deviation with bounded error", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2114--2126", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476266", abstract = "The median absolute deviation (MAD) is a statistic measuring the variability of a set of quantitative elements. It is known to be more robust to outliers than the standard deviation (SD), and thereby widely used in outlier detection. Computing the exact \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:EEG, author = "Mengxuan Zhang and Lei Li and Xiaofang Zhou", title = "An experimental evaluation and guideline for path finding in weighted dynamic network", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2127--2140", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476267", abstract = "Shortest path computation is a building block of various network applications. Since real-life networks evolve as time passes, the Dynamic Shortest Path (DSP) problem has drawn lots of attention in recent years. However, as DSP has many factors related \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vandevoort:2021:RAR, author = "Brecht Vandevoort and Bas Ketsman and Christoph Koch and Frank Neven", title = "Robustness against read committed for transaction templates", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2141--2153", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476268", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476268", abstract = "The isolation level Multiversion Read Committed (RC), offered by many database systems, is known to trade consistency for increased transaction throughput. Sometimes, transaction workloads can be safely executed under RC obtaining the perfect isolation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:LLC, author = "Huayi Zhang and Lei Cao and Samuel Madden and Elke Rundensteiner", title = "{LANCET}: labeling complex data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2154--2166", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476269", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476269", abstract = "Cutting-edge machine learning techniques often require millions of labeled data objects to train a robust model. Because relying on humans to supply such a huge number of labels is rarely practical, automated methods for label generation are needed. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:VSE, author = "Yang Li and Yu Shen and Wentao Zhang and Jiawei Jiang and Bolin Ding and Yaliang Li and Jingren Zhou and Zhi Yang and Wentao Wu and Ce Zhang and Bin Cui", title = "{VolcanoML}: speeding up end-to-end {AutoML} via scalable search space decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2167--2176", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476270", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476270", abstract = "End-to-end AutoML has attracted intensive interests from both academia and industry, which automatically searches for ML pipelines in a space induced by feature engineering, algorithm/model selection, and hyper-parameter tuning. Existing AutoML systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2021:QTF, author = "Peng Cheng and Jiabao Jin and Lei Chen and Xuemin Lin and Libin Zheng", title = "A queueing-theoretic framework for vehicle dispatching in dynamic car-hailing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2177--2189", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476271", abstract = "With the rapid development of smart mobile devices, the car-hailing platforms (e.g., Uber or Lyft) have attracted much attention from the academia and the industry. 
In this paper, we consider a dynamic car-hailing problem, namely maximum revenue vehicle \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2021:DSD, author = "Kuntai Cai and Xiaoyu Lei and Jianxin Wei and Xiaokui Xiao", title = "Data synthesis via differentially private {Markov} random fields", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2190--2202", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476272", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476272", abstract = "This paper studies the synthesis of high-dimensional datasets with differential privacy (DP). The state-of-the-art solution addresses this problem by first generating a set M of noisy low-dimensional marginals of the input data D, and then use them to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Whittaker:2021:SRS, author = "Michael Whittaker and Ailidani Ailijiang and Aleksey Charapko and Murat Demirbas and Neil Giridharan and Joseph M. 
Hellerstein and Heidi Howard and Ion Stoica and Adriana Szekeres", title = "Scaling replicated state machines with compartmentalization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2203--2215", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476273", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476273", abstract = "State machine replication protocols, like MultiPaxos and Raft, are a critical component of many distributed systems and databases. However, these protocols offer relatively low throughput due to several bottlenecked components. Numerous existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sarkar:2021:CAL, author = "Subhadeep Sarkar and Dimitris Staratzis and Zichen Zhu and Manos Athanassoulis", title = "Constructing and analyzing the {LSM} compaction design space", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2216--2229", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476274", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476274", abstract = "Log-structured merge (LSM) trees offer efficient ingestion by appending incoming data, and thus, are widely used as the storage layer of production NoSQL data stores. To enable competitive read performance, LSM-trees periodically re-organize data to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hellings:2021:BSB, author = "Jelle Hellings and Mohammad Sadoghi", title = "{ByShard}: sharding in a {Byzantine} environment", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2230--2243", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476275", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476275", abstract = "The emergence of blockchains has fueled the development of resilient systems that can deal with Byzantine failures due to crashes, bugs, or even malicious behavior. Recently, we have also seen the exploration of sharding in these resilient systems, this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ertl:2021:SFG, author = "Otmar Ertl", title = "{SetSketch}: filling the gap between {MinHash} and {HyperLogLog}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2244--2257", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476276", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476276", abstract = "MinHash and HyperLogLog are sketching algorithms that have become indispensable for set summaries in big data applications. While HyperLogLog allows counting different elements with very little space, MinHash is suitable for the fast comparison of sets \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bao:2021:CEM, author = "Ergute Bao and Yin Yang and Xiaokui Xiao and Bolin Ding", title = "{CGM}: an enhanced mechanism for streaming data collection with local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2258--2270", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476277", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476277", abstract = "Local differential privacy (LDP) is a well-established privacy protection scheme for collecting sensitive data, which has been integrated into major platforms such as iOS, Chrome, and Windows. The main idea is that each individual randomly perturbs her \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Leo:2021:ETA, author = "Dean {De Leo} and Per Fuchs and Peter Boncz", title = "Errata for {``Teseo and the analysis of structural dynamic graphs'': (PVLDB {\bf 14}(6):1053--1066)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2271--2272", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476278", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{DeLeo:2021:TAS}.", URL = "https://dl.acm.org/doi/10.14778/3476249.3476278", abstract = "In our paper [4], we experimentally evaluated our work, Teseo, together with five other systems under the LDBC Graphalytics benchmark [6]. We developed and publicly released [2] an ad-hoc driver for the purpose. 
Since the time the paper was published, a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Musleh:2021:QMB, author = "Mashaal Musleh and Sofiane Abbar and Rade Stanojevic and Mohamed Mokbel", title = "{QARTA}: an {ML}-based system for accurate map services", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2273--2282", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476279", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476279", abstract = "Maps services are ubiquitous in widely used applications including navigation systems, ride sharing, and items/food delivery. Though there are plenty of efforts to support such services through designing more efficient algorithms, we believe that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cunningham:2021:RWT, author = "Teddy Cunningham and Graham Cormode and Hakan Ferhatosmanoglu and Divesh Srivastava", title = "Real-world trajectory sharing with local differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2283--2295", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476280", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476280", abstract = "Sharing trajectories is beneficial for many real-world applications, such as managing disease spread through contact tracing and tailoring public services to a population's travel patterns. 
However, public concern over privacy and data protection has \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sinthong:2021:PRQ, author = "Phanwadee Sinthong and Michael J. Carey", title = "{PolyFrame}: a retargetable query-based approach to scaling dataframes", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2296--2304", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476281", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 06:21:49 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476281", abstract = "In the last few years, the field of data science has been growing rapidly as various businesses have adopted statistical and machine learning techniques to empower their decision-making and applications. Scaling data analyses to large volumes of data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shi:2021:SCD, author = "Jessica Shi and Laxman Dhulipala and David Eisenstat and Jakub {\L}{\k{a}}cki and Vahab Mirrokni", title = "Scalable community detection via parallel correlation clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2305--2313", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476282", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476282", abstract = "Graph clustering and community detection are central problems in modern data mining.
The increasing need for analyzing billion-scale data calls for faster and more scalable algorithms for these problems. There are certain trade-offs between the quality \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2021:SSB, author = "Cheng Xu and Ce Zhang and Jianliang Xu and Jian Pei", title = "{SlimChain}: scaling blockchain transactions through off-chain storage and parallel processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2314--2326", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476283", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476283", abstract = "Blockchain technology has emerged as the cornerstone of many decentralized applications operating among otherwise untrusted peers. However, it is well known that existing blockchain systems do not scale well. Transactions are often executed and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:TOG, author = "Side Li and Arun Kumar", title = "Towards an optimized {GROUP BY} abstraction for large-scale machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2327--2340", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476284", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476284", abstract = "Many applications that use large-scale machine learning (ML) increasingly prefer different models for subgroups (e.g., countries) to improve accuracy, fairness, or other desiderata. We call this emerging popular practice learning over groups, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2021:AAA, author = "Daniel Kang and John Guibas and Peter Bailis and Tatsunori Hashimoto and Yi Sun and Matei Zaharia", title = "Accelerating approximate aggregation queries with expensive predicates", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2341--2354", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476285", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476285", abstract = "Researchers and industry analysts are increasingly interested in computing aggregation queries over large, unstructured datasets with selective predicates that are computed using expensive deep neural networks (DNNs). As these DNNs are expensive and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schmidt:2021:FDA, author = "Tobias Schmidt and Maximilian Bandle and Jana Giceva", title = "A four-dimensional analysis of partitioned approximate filters", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2355--2368", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476286", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476286", abstract = "With today's data deluge, approximate filters are particularly attractive to avoid expensive operations like remote data/disk accesses. Among the many filter variants available, it is non-trivial to find the most suitable one and its optimal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chiosa:2021:SOP, author = "Monica Chiosa and Thomas B. Preu{\ss}er and Gustavo Alonso", title = "{SKT}: a one-pass multi-sketch data analytics accelerator", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2369--2382", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476287", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476287", abstract = "Data analysts often need to characterize a data stream as a first step to its further processing. Some of the initial insights to be gained include, e.g., the cardinality of the data set and its frequency distribution. Such information is typically \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fent:2021:PAG, author = "Philipp Fent and Thomas Neumann", title = "A practical approach to groupjoin and nested aggregates", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2383--2396", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476288", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476288", abstract = "Groupjoins, the combined execution of a join and a subsequent group by, are common in analytical queries, and occur in about 1/8 of the queries in TPC-H and TPC-DS. While they were originally invented to improve performance, efficient parallel execution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2021:RVQ, author = "Ziyun Wei and Immanuel Trummer and Connor Anderson", title = "Robust voice querying with {MUVE}: optimally visualizing results of phonetically similar queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2397--2409", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476289", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476289", abstract = "Recently proposed voice query interfaces translate voice input into SQL queries. Unreliable speech recognition on top of the intrinsic challenges of text-to-SQL translation makes it hard to reliably interpret user input. We present MUVE (Multiplots for \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2021:CCF, author = "Yinjun Wu and James Weimer and Susan B. Davidson", title = "{CHEF}: a cheap and fast pipeline for iteratively cleaning label uncertainties", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2410--2418", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476290", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476290", abstract = "High-quality labels are expensive to obtain for many machine learning tasks, such as medical image classification tasks. Therefore, probabilistic (weak) labels produced by weak supervision tools are used to seed a process in which influential samples \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Siddiqui:2021:CAG, author = "Tarique Siddiqui and Surajit Chaudhuri and Vivek Narasayya", title = "{COMPARE}: accelerating groupwise comparison in relational databases for data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2419--2431", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476291", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476291", abstract = "Data analysis often involves comparing subsets of data across many dimensions for finding unusual trends and patterns. While the comparison between subsets of data can be expressed using SQL, they tend to be complex to write, and suffer from poor \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Durner:2021:CUC, author = "Dominik Durner and Badrish Chandramouli and Yinan Li", title = "{Crystal}: a unified cache storage system for analytical databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2432--2444", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476292", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476292", abstract = "Cloud analytical databases employ a disaggregated storage model, where the elastic compute layer accesses data persisted on remote cloud storage in block-oriented columnar formats. Given the high latency and low bandwidth to remote storage and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cetorelli:2021:SEP, author = "Valerio Cetorelli and Paolo Atzeni and Valter Crescenzi and Franco Milicchio", title = "The smallest extraction problem", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2445--2458", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476293", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476293", abstract = "We introduce landmark grammars, a new family of context-free grammars aimed at describing the HTML source code of pages published by large and templated websites and therefore at effectively tackling Web data extraction problems. Indeed, they address \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Thirumuruganathan:2021:DLB, author = "Saravanan Thirumuruganathan and Han Li and Nan Tang and Mourad Ouzzani and Yash Govind and Derek Paulsen and Glenn Fung and AnHai Doan", title = "Deep learning for blocking in entity matching: a design space exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2459--2472", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476294", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476294", abstract = "Entity matching (EM) finds data instances that refer to the same real-world entity. Most EM solutions perform blocking then matching. Many works have applied deep learning (DL) to matching, but far fewer works have applied DL to blocking. These blocking \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:GID, author = "Wentao Zhang and Zhi Yang and Yexin Wang and Yu Shen and Yang Li and Liang Wang and Bin Cui", title = "{GRAIN}: improving data efficiency of {\em gra\/}ph neural networks via diversified {\em in\/}fluence maximization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2473--2482", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476295", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476295", abstract = "Data selection methods, such as active learning and core-set selection, are useful tools for improving the data efficiency of deep learning models on large-scale datasets. However, recent deep learning models have moved forward from independent and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bandle:2021:DTM, author = "Maximilian Bandle and Jana Giceva", title = "Database technology for the masses: sub-operators as first-class entities", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2483--2490", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476296", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476296", abstract = "A wealth of technology has evolved around relational databases over decades that has been successfully tried and tested in many settings and use cases. 
Yet, the majority of it remains overlooked in the pursuit of performance (e.g., NoSQL) or new \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gupta:2021:CSL, author = "Pranjal Gupta and Amine Mhedhbi and Semih Salihoglu", title = "Columnar storage and list-based processing for graph database management systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2491--2504", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476297", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476297", abstract = "We revisit column-oriented storage and query processing techniques in the context of contemporary graph database management systems (GDBMSs). Similar to column-oriented RDBMSs, GDBMSs support read-heavy analytical workloads that however have \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2021:PLB, author = "Yiwen Zhu and Matteo Interlandi and Abhishek Roy and Krishnadhan Das and Hiren Patel and Malay Bag and Hitesh Sharma and Alekh Jindal", title = "{Phoebe}: a learning-based checkpoint optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2505--2518", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476298", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476298", abstract = "Easy-to-use programming interfaces paired with cloud-scale processing engines have enabled big data system users to author arbitrarily complex analytical jobs over massive volumes of data. However, as the complexity and scale of analytical jobs increase, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nargesian:2021:TDS, author = "Fatemeh Nargesian and Abolfazl Asudeh and H. V. Jagadish", title = "Tailoring data source distributions for fairness-aware data integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2519--2532", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476299", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476299", abstract = "Data scientists often develop data sets for analysis by drawing upon sources of data available to them.
A major challenge is to ensure that the data set used for analysis has an appropriate representation of relevant (demographic) groups: it meets \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bansal:2021:MVI, author = "Parikshit Bansal and Prathamesh Deshpande and Sunita Sarawagi", title = "Missing value imputation on multidimensional time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2533--2545", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476300", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476300", abstract = "We present DeepMVI, a deep learning method for missing value imputation in multidimensional time-series datasets. Missing values are commonplace in decision support platforms that aggregate data over long time stretches from disparate sources, whereas \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rezig:2021:HSD, author = "El Kindi Rezig and Mourad Ouzzani and Walid G. Aref and Ahmed K. Elmagarmid and Ahmed R. 
Mahmood and Michael Stonebraker", title = "{Horizon}: scalable dependency-driven data cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2546--2554", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476301", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476301", abstract = "A large class of data repair algorithms rely on integrity constraints to detect and repair errors. A well-studied class of constraints is Functional Dependencies (FDs, for short). Although there has been an increased interest in developing general data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shaowang:2021:DDS, author = "Ted Shaowang and Nilesh Jain and Dennis D. Matthews and Sanjay Krishnan", title = "Declarative data serving: the future of machine learning inference on the edge", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2555--2562", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476302", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476302", abstract = "Recent advances in computer architecture and networking have ushered in a new age of edge computing, where computation is placed close to the point of data collection to facilitate low-latency decision making. As the complexity of such deployments grow \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2021:APS, author = "Junwen Yang and Yeye He and Surajit Chaudhuri", title = "{Auto-Pipeline}: synthesizing complex data pipelines by-target using reinforcement learning and search", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2563--2575", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476303", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476303", abstract = "Recent work has made significant progress in helping users to automate single data preparation steps, such as string-transformations and table-manipulation operators (e.g., Join, GroupBy, Pivot, etc.). We in this work propose to automate multiple such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lockhart:2021:EIQ, author = "Brandon Lockhart and Jinglin Peng and Weiyuan Wu and Jiannan Wang and Eugene Wu", title = "Explaining inference queries with {Bayesian} optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2576--2585", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476304", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476304", abstract = "Obtaining an explanation for an SQL query result can enrich the analysis experience, reveal data errors, and provide deeper insight into the data.
Inference query explanation seeks to explain unexpected aggregate query results on inference data; such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:DBF, author = "Chunwei Liu and Hao Jiang and John Paparrizos and Aaron J. Elmore", title = "Decomposed bounded floats for fast compression and queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2586--2598", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476305", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476305", abstract = "Modern data-intensive applications often generate large amounts of low precision float data with a limited range of values. Despite the prevalence of such data, there is a lack of an effective solution to ingest, store, and analyze bounded, low-precision, numeric data. To address this gap, we propose Buff, a new compression technique that uses a decomposed columnar storage and encoding methods to provide effective compression, fast ingestion, and high-speed in-situ adaptive query operators with SIMD support.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tziavelis:2021:BEJ, author = "Nikolaos Tziavelis and Wolfgang Gatterbauer and Mirek Riedewald", title = "Beyond equi-joins: ranking, enumeration and factorization", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2599--2612", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476306", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476306", abstract = "We study theta-joins in general and join predicates with conjunctions and disjunctions of inequalities in particular, focusing on ranked enumeration where the answers are returned incrementally in an order dictated by a given ranking function. Our \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jacob:2021:EBE, author = "Vincent Jacob and Fei Song and Arnaud Stiegler and Bijan Rad and Yanlei Diao and Nesime Tatbul", title = "{Exathlon}: a benchmark for explainable anomaly detection over time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2613--2626", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476307", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476307", abstract = "Access to high-quality data repositories and benchmarks have been instrumental in advancing the state of the art in many experimental research domains. 
While advanced analytics tasks over time series data have been gaining lots of attention, lack of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kuchnik:2021:PCR, author = "Michael Kuchnik and George Amvrosiadis and Virginia Smith", title = "Progressive compressed records: taking a byte out of deep learning data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2627--2641", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476308", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476308", abstract = "Deep learning accelerators efficiently train over vast and growing amounts of data, placing a newfound burden on commodity networks and storage devices. A common approach to conserve bandwidth involves resizing or compressing data prior to training. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alsaudi:2021:TFQ, author = "Abdulrahman Alsaudi and Yasser Altowim and Sharad Mehrotra and Yaming Yu", title = "{TQEL}: framework for query-driven linking of top-$k$ entities in social media blogs", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "11", pages = "2642--2654", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476249.3476309", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 18:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476249.3476309", abstract = "Social media analysis over blogs (such as tweets) often requires determining top-k mentions of a certain category (e.g., movies) in a collection (e.g., tweets collected over a given day). Such queries require entity linking (EL) function to be executed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2021:KEN, author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U and Weng Hou Tong and Shivansh Mittal and Ye Li and Reynold Cheng", title = "{KDV-Explorer}: a near real-time kernel density visualization system for spatial analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2655--2658", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476312", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476312", abstract = "Kernel density visualization (KDV) is a commonly used visualization tool for many spatial analysis tasks, including disease outbreak detection, crime hotspot detection, and traffic accident hotspot detection.
Although the most popular geographical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:RRI, author = "Zhebin Zhang and Dajie Dong and Yuhang Ma and Yilong Ying and Dawei Jiang and Ke Chen and Lidan Shou and Gang Chen", title = "{Refiner}: a reliable incentive-driven federated learning system powered by blockchain", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2659--2662", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476313", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476313", abstract = "Modern mobile applications often produce decentralized data, i.e., a huge amount of privacy-sensitive data distributed over a large number of mobile devices. Techniques for learning models from decentralized data must properly handle two natures of such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Uotila:2021:MMM, author = "Valter Uotila and Jiaheng Lu and Dieter Gawlick and Zhen Hua Liu and Souripriya Das and Gregory Pogossiants", title = "{MultiCategory}: multi-model query processing meets category theory and functional programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2663--2666", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476314", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476314", abstract = "The variety of data is one of the important issues in the era of Big Data. The data are naturally organized in different formats and models, including structured data, semi-structured data, and unstructured data. Prior research has envisioned an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:CCQ, author = "Qichen Wang and Chaoqi Zhang and Danish Alsayed and Ke Yi and Bin Wu and Feifei Li and Chaoqun Zhan", title = "{Cquirrel}: continuous query processing over acyclic relational schemas", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2667--2670", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476315", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476315", abstract = "We will demonstrate Cquirrel, a continuous query processing engine built on top of Flink. 
Cquirrel assumes a relational schema where the foreign-key constraints form a directed acyclic graph, and supports any selection-projection-join-aggregation query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mao:2021:DDF, author = "Yuetian Mao and Shuai Yuan and Nan Cui and Tianjiao Du and Beijun Shen and Yuting Chen", title = "{DeFiHap}: detecting and fixing {HiveQL} anti-patterns", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2671--2674", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476316", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476316", abstract = "The emergence of Hive greatly facilitates the management of massive data stored in various places. Meanwhile, data scientists face challenges during HiveQL programming --- they may not use correct and/or efficient HiveQL statements in their programs; \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Helal:2021:DKD, author = "Ahmed Helal and Mossad Helali and Khaled Ammar and Essam Mansour", title = "A demonstration of {KGLac}: a data discovery and enrichment platform for data science", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2675--2678", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476317", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476317", abstract = "Data science growing success relies on knowing where a relevant dataset exists, understanding its impact on a specific task, finding ways to enrich a dataset, and leveraging insights derived from it. With the growth of open data initiatives, data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Faure-Giovagnoli:2021:AVL, author = "Pierre Faure-Giovagnoli and Marie {Le Guilly} and Jean-Marc Petit and Vasile-Marian Scuturici", title = "{ADESIT}: visualize the limits of your data in a machine learning process", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2679--2682", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476318", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476318", abstract = "Thanks to the numerous machine learning tools available to us nowadays, it is easier than ever to derive a model from a dataset in the frame of a supervised learning problem. 
However, when this model behaves poorly compared with an expected performance, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2021:PAM, author = "Yinzhao Yan and Raymond Chi-Wing Wong", title = "{Path Advisor}: a multi-functional campus map tool for shortest path", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2683--2686", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476319", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476319", abstract = "The shortest path in both the two dimensional (2D) plane and the three dimensional (3D) terrain is extensively used both in industry and academia. Although there are some map visualization tools for viewing the shortest path in 2D and 3D views, we find \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:IHL, author = "Liangde Li and Supun Nakandala and Arun Kumar", title = "Intermittent human-in-the-loop model selection using {Cerebro}: a demonstration", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2687--2690", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476320", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476320", abstract = "Deep learning (DL) is revolutionizing many fields.
However, there is a major bottleneck for the wide adoption of DL: the pain of model selection, which requires exploring a large config space of model architecture and training hyper-parameters before \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Funke:2021:LLC, author = "Henning Funke and Jens Teubner", title = "Low-latency compilation of {SQL} queries to machine code", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2691--2694", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476321", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476321", abstract = "Query compilation has proven to be one of the most efficient query processing techniques. Despite its fast processing speed, the additional compilation times of the technique limit its applicability. This is because the approach is most beneficial only \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Groppe:2021:SDS, author = "Sven Groppe and Rico Klinckenberg and Benjamin Warnke", title = "Sound of databases: sonification of a semantic web database engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2695--2698", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476322", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476322", abstract = "Sonifications map data to auditory dimensions and offer a new audible experience to their listeners. 
We propose a sonification of query processing paired with a corresponding visualization both integrated in a web application. In this demonstration we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:HHM, author = "Zihao Chen and Zhizhen Xu and Chen Xu and Juan Soto and Volker Markl and Weining Qian and Aoying Zhou", title = "{HyMAC}: a hybrid matrix computation system", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2699--2702", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476323", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476323", abstract = "Distributed matrix computation is common in large-scale data processing and machine learning applications. Iterative-convergent algorithms involving matrix computation share a common property: parameters converge non-uniformly. This property can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2021:GOS, author = "Jingbo Xu and Zhanning Bai and Wenfei Fan and Longbin Lai and Xue Li and Zhao Li and Zhengping Qian and Lei Wang and Lei Wang and Yanyan Wang and Wenyuan Yu and Jingren Zhou", title = "{GraphScope}: a one-stop large graph processing system", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2703--2706", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476324", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476324", abstract = "Due to diverse graph data and algorithms, programming and orchestration of complex computation pipelines have become the major challenges to making use of graph applications for Web-scale data analysis. GraphScope aims to provide a one-stop and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Renz-Wieland:2021:JMI, author = "Alexander Renz-Wieland and Tobias Drobisch and Zoi Kaoudi and Rainer Gemulla and Volker Markl", title = "Just move it!: dynamic parameter allocation in action", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2707--2710", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476325", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476325", abstract = "Parameter servers (PSs) ease the implementation of distributed machine learning systems, but their performance can fall behind that of single machine baselines due to communication overhead. 
We demonstrate Lapse, an open source PS with dynamic parameter \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Orogat:2021:CDC, author = "Abdelghny Orogat and Ahmed El-Roby", title = "{CBench}: demonstrating comprehensive evaluation of question answering systems over knowledge graphs through deep analysis of benchmarks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2711--2714", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476326", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476326", abstract = "A plethora of question answering (QA) systems that retrieve answers to natural language questions from knowledge graphs have been developed in recent years. However, choosing a benchmark to accurately assess the quality of a question answering system is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Woltmann:2021:PPM, author = "Lucas Woltmann and Dominik Olwig and Claudio Hartmann and Dirk Habich and Wolfgang Lehner", title = "{PostCENN}: {PostgreSQL} with machine learning models for cardinality estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2715--2718", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476327", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476327", abstract = "In this demo, we present PostCENN, an enhanced PostgreSQL database system with an end-to-end integration of machine learning (ML) models for cardinality estimation. In general, cardinality estimation is a topic with a long history in the database \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:DDU, author = "Jinyang Li and Yuval Moskovitch and H. V. Jagadish", title = "{DENOUNCER}: detection of unfairness in classifiers", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2719--2722", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476328", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476328", abstract = "The use of automated data-driven tools for decision-making has gained popularity in recent years. At the same time, the reported cases of algorithmic bias and discrimination increase as well, which in turn lead to an extensive study of algorithmic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abbar:2021:DQM, author = "Sofiane Abbar and Rade Stanojevic and Mashaal Musleh and Mohamed ElShrif and Mohamed Mokbel", title = "A demonstration of {QARTA}: an {ML}-based system for accurate map services", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2723--2726", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476329", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476329", abstract = "This demo presents QARTA; an open-source full-fledged system for highly accurate and scalable map services. QARTA employs machine learning techniques to: (a) construct its own highly accurate map in terms of both map topology and edge weights, and (b) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Smith:2021:TTN, author = "Jaclyn Smith and Michael Benedikt and Brandon Moore and Milos Nikolic", title = "{TraNCE}: transforming nested collections efficiently", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2727--2730", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476330", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476330", abstract = "Nested relational query languages have long been seen as an attractive tool for scenarios involving large hierarchical datasets. There has been a resurgence of interest in nested relational languages. 
One driver has been the affinity of these languages \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Diestelkamper:2021:DMA, author = "Ralf Diestelk{\"a}mper and Seokki Lee and Boris Glavic and Melanie Herschel", title = "Debugging missing answers for {Spark} queries over nested data with {Breadcrumb}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2731--2734", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476331", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476331", abstract = "We present Breadcrumb, a system that aids developers in debugging queries through query-based explanations for missing answers. Given as input a query and an expected, but missing, query result, Breadcrumb identifies operators in the input query that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2021:DPW, author = "Renzhi Wu and Prem Sakala and Peng Li and Xu Chu and Yeye He", title = "Demonstration of {Panda}: a weakly supervised entity matching system", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2735--2738", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476332", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476332", abstract = "Entity matching (EM) refers to the problem of identifying tuple pairs in one or more relations that refer to the same real world entities. 
Supervised machine learning (ML) approaches, and deep learning based approaches in particular, typically achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:ADAb, author = "Jiabin Liu and Fu Zhu and Chengliang Chai and Yuyu Luo and Nan Tang", title = "Automatic data acquisition for deep learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2739--2742", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476333", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476333", abstract = "Deep learning (DL) has widespread applications and has revolutionized many industries. Although automated machine learning (AutoML) can help us away from coding for DL models, the acquisition of lots of high-quality data for model training remains a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2021:DSD, author = "Xuanhe Zhou and Lianyuan Jin and Ji Sun and Xinyang Zhao and Xiang Yu and Jianhua Feng and Shifu Li and Tianqing Wang and Kun Li and Luyang Liu", title = "{DBMind}: a self-driving platform in {openGauss}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2743--2746", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476334", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476334", abstract = "We demonstrate a self-driving system DBMind, which provides three autonomous capabilities in database, including self-monitoring, self-diagnosis and self-optimization. First, self-monitoring judiciously collects database metrics and detects anomalies \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2021:DDE, author = "Qiongqiong Lin and Jiayao Zhang and Jinfei Liu and Kui Ren and Jian Lou and Junxu Liu and Li Xiong and Jian Pei and Jimeng Sun", title = "Demonstration of {Dealer}: an end-to-end model marketplace with differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2747--2750", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476335", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476335", abstract = "Data-driven machine learning (ML) has witnessed great success across a variety of application domains. 
Since ML model training relies on a large amount of data, there is a growing demand for high-quality data to be collected for ML model training. Data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mu:2021:AAC, author = "Tianyu Mu and Hongzhi Wang and Shenghe Zheng and Shaoqing Zhang and Cheng Liang and Haoyun Tang", title = "{Assassin}: an automatic classification system based on algorithm selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2751--2754", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476336", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476336", abstract = "The increasing complexity of data analysis tasks makes it dependent on human expertise and challenging for non-experts. One of the major challenges faced in data analysis is the selection of the proper algorithm for given tasks and data sets. Motivated \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2021:AMD, author = "Lei Cao and Dongqing Xiao and Yizhou Yan and Samuel Madden and Guoliang Li", title = "{ATLANTIC}: making database differentially private and faster with accuracy guarantee", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2755--2758", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476337", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476337", abstract = "Differential privacy promises to enable data sharing and general data analytics while protecting individual privacy. Because the private data is often stored in the form of relational database that supports SQL queries, making SQL-based analytics \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2021:DMS, author = "Anze Xie and Anders Carlsson and Jason Mohoney and Roger Waleffe and Shanan Peters and Theodoros Rekatsinas and Shivaram Venkataraman", title = "Demo of {Marius}: a system for large-scale graph embeddings", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2759--2762", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476338", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476338", abstract = "Graph embeddings have emerged as the de facto representation for modern machine learning over graph data structures. 
The goal of graph embedding models is to convert high-dimensional sparse graphs into low-dimensional, dense and continuous vector spaces \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Muller:2021:PPO, author = "Heiko M{\"u}ller and Sonia Castelo and Munaf Qazi and Juliana Freire", title = "From papers to practice: the \pkg{openclean} open-source data cleaning library", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2763--2766", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476339", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476339", abstract = "Data preparation is still a major bottleneck for many data science projects. Even though many sophisticated algorithms and tools have been proposed in the research literature, it is difficult for practitioners to integrate them into their data wrangling \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2021:DAD, author = "Yongming Ge and Vanessa Lin and Maureen Daum and Brandon Haynes and Alvin Cheung and Magdalena Balazinska", title = "Demonstration of {Apperception}: a database management system for geospatial video data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2767--2770", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476340", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476340", abstract = "Many recent video applications---including traffic monitoring, drone analytics, autonomous driving, and virtual reality---require piecing together, combining, and operating over many related video streams. Despite the massive data volumes involved and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karatzoglidi:2021:AEC, author = "Mary Karatzoglidi and Paraskevas Kerasiotis and Verena Kantere", title = "Automated energy consumption forecasting with {EnForce}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2771--2774", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476341", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476341", abstract = "The need to reduce energy consumption on a global scale has been of high importance during the last years. 
Research has created methods to make highly accurate forecasts on the energy consumption of buildings and there have been efforts towards the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jang:2021:RWG, author = "Myung-Hwan Jang and Yong-Yeon Jo and Sang-Wook Kim", title = "{RealGraph$^{\mathrm{Web}}$}: a graph analysis platform on the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2775--2778", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476342", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476342", abstract = "In this demo, we present RealGraph$^{\mathrm{Web}}$, a web-based platform that provides various kinds of graph analysis services. RealGraph$^{\mathrm{Web}}$ is based on RealGraph, a graph engine that addresses the problem of performance degradation in processing real-world big \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ghosh:2021:IDS, author = "Arthita Ghosh and Deven Bansod and Arpit Narechania and Prashanth Dintyala and Su Timurturkan and Joy Arulraj", title = "Interactive demonstration of {SQLCheck}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2779--2782", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476343", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476343", abstract = "We will demonstrate a prototype of sqlcheck, a holistic toolchain for automatically finding and fixing anti-patterns in database applications. The advent of modern database-as-a-service platforms has made it easy for developers to quickly create \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2021:CET, author = "Yiming Lin and Pramod Khargonekar and Sharad Mehrotra and Nalini Venkatasubramanian", title = "{T-Cove}: an exposure tracing system based on cleaning {Wi-Fi} events on organizational premises", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2783--2786", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476344", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476344", abstract = "WiFi connectivity events, generated when a mobile device connects to WiFi access points can serve as a robust, passive, (almost) zero-cost indoor localization technology. 
The challenge is the coarse level localization it offers that limits its \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:DGE, author = "Paul Y. Wang and Sainyam Galhotra and Romila Pradhan and Babak Salimi", title = "Demonstration of generating explanations for black-box algorithms using {Lewis}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2787--2790", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476345", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476345", abstract = "Explainable artificial intelligence (XAI) aims to reduce the opacity of AI-based decision-making systems, allowing humans to scrutinize and trust them. Unlike prior work that attributes the responsibility for an algorithm's decisions to its inputs as a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Castelo:2021:ADS, author = "Sonia Castelo and R{\'e}mi Rampin and A{\'e}cio Santos and Aline Bessa and Fernando Chirigati and Juliana Freire", title = "{Auctus}: a dataset search engine for data discovery and augmentation", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2791--2794", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476346", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476346", abstract = "The large volumes of structured data currently available, from Web tables to open-data portals and enterprise data, open up new opportunities for progress in answering many important scientific, societal, and business questions. However, finding \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rehman:2021:DRS, author = "Mohammed Suhail Rehman and Silu Huang and Aaron J. Elmore", title = "A demonstration of {RELIC}: a system for retrospective lineage inference of data workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2795--2798", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476347", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476347", abstract = "The ad-hoc, heterogeneous process of modern data science typically involves loading, cleaning, and mutating dataset(s) into multiple versions recorded as artifacts by various tools within a single data science workflow. 
Lineage information, including \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:SSC, author = "Zhihao Chen and Haizhen Zhuo and Quanqing Xu and Xiaodong Qi and Chengyu Zhu and Zhao Zhang and Cheqing Jin and Aoying Zhou and Ying Yan and Hui Zhang", title = "{SChain}: a scalable consortium blockchain exploiting intra- and inter-block concurrency", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2799--2802", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476348", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476348", abstract = "We demonstrate SChain, a consortium blockchain that scales transaction processing to support large-scale enterprise applications. The unique advantage of SChain stems from the exploitation of both intra- and inter-block concurrency. The intra-block \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Anastasiou:2021:EEP, author = "Chrysovalantis Anastasiou and Constantinos Costa and Panos K. 
Chrysanthis and Cyrus Shahabi", title = "{EPICGen}: an experimental platform for indoor congestion generation and forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2803--2806", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476349", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476349", abstract = "Effectively and accurately forecasting the congestion in indoor spaces has become particularly important during the pandemic in order to reduce the risk of exposure to airborne viruses. However, there is a lack of readily available indoor congestion \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arnaout:2021:WKB, author = "Hiba Arnaout and Simon Razniewski and Gerhard Weikum and Jeff Z. Pan", title = "{Wikinegata}: a knowledge base with interesting negative statements", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2807--2810", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476350", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476350", abstract = "Databases about general-world knowledge, so-called knowledge bases (KBs), are important in applications such as search and question answering. Traditionally, although KBs use open world assumption, popular KBs only store positive information, but \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2021:FEE, author = "Jinwei Zhu and Kun Cheng and Jiayang Liu and Liang Guo", title = "Full encryption: an end to end encryption mechanism in {GaussDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2811--2814", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476351", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476351", abstract = "In this paper, we present a novel mechanism called Full Encryption (FE) in GaussDB. FE-in-GaussDB provides column-level encryption for sensitive data, and secures the asset from any malicious cloud administrator or information leakage attack. It ensures \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mandamadiotis:2021:DIA, author = "Antonis Mandamadiotis and Stavroula Eleftherakis and Apostolos Glenis and Dimitrios Skoutas and Yannis Stavrakas and Georgia Koutrika", title = "{DatAgent}: the imminent age of intelligent data assistants", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2815--2818", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476352", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476352", abstract = "In this demonstration, we present DatAgent, an intelligent data assistant system that allows users to ask queries in natural language, and can respond in natural language as well. 
Moreover, the system actively guides the user using different types of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rezig:2021:DDD, author = "El Kindi Rezig and Anshul Bhandari and Anna Fariha and Benjamin Price and Allan Vanterpool and Vijay Gadepally and Michael Stonebraker", title = "{DICE}: data discovery by example", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2819--2822", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476353", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476353", abstract = "In order to conduct analytical tasks, data scientists often need to find relevant data from an avalanche of sources (e.g., data lakes, large organizational databases). This effort is typically made in an ad hoc, non-systematic manner, which makes it a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schuhknecht:2021:AAP, author = "Felix Schuhknecht and Aaron Priesterroth and Justus Henneberg and Reza Salkhordeh", title = "{AnyOLAP}: analytical processing of arbitrary data-intensive applications without {ETL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2823--2826", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476354", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476354", abstract = "The volume of data that is processed and produced by modern data-intensive applications is constantly increasing. Of course, along with the volume, the interest in analyzing and interpreting this data increases as well. As a consequence, more and more \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jacob:2021:DEB, author = "Vincent Jacob and Fei Song and Arnaud Stiegler and Bijan Rad and Yanlei Diao and Nesime Tatbul", title = "A demonstration of the {Exathlon} benchmarking platform for explainable anomaly detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2827--2830", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476355", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476355", abstract = "In this demo, we introduce Exathlon --- a new benchmarking platform for explainable anomaly detection over high-dimensional time series. 
We designed Exathlon to support data scientists and researchers in developing and evaluating learned models and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shaikhha:2021:IRH, author = "Amir Shaikhha and Maximilian Schleich and Dan Olteanu", title = "An intermediate representation for hybrid database and machine learning workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2831--2834", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476356", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476356", abstract = "IFAQ is an intermediate representation and compilation framework for hybrid database and machine learning workloads expressible using iterative programs with functional aggregate queries. We demonstrate IFAQ for several OLAP queries, linear algebra \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pastor:2021:HDY, author = "Eliana Pastor and Andrew Gavgavian and Elena Baralis and Luca de Alfaro", title = "How divergent is your data?", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2835--2838", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476357", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476357", abstract = "We present DivExplorer, a tool that enables users to explore datasets and find subgroups of data for which a classifier behaves in an anomalous manner. 
These subgroups, denoted as divergent subgroups, may exhibit, for example, higher-than-normal false \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Berro:2021:ERP, author = "Auday Berro and Mohammad-Ali Yaghub Zade Fard and Marcos Baez and Boualem Benatallah and Khalid Benabdeslem", title = "An extensible and reusable pipeline for automated utterance paraphrases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2839--2842", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476358", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476358", abstract = "In this demonstration paper we showcase an extensible and reusable pipeline for automatic paraphrase generation, i.e., reformulating sentences using different words. Capturing the nuances of human language is fundamental to the effectiveness of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Beedkar:2021:CGD, author = "Kaustubh Beedkar and David Brekardin and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker Markl", title = "Compliant geo-distributed data processing in action", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2843--2846", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476359", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476359", abstract = "In this paper we present our work on compliant geo-distributed data processing. 
Our work focuses on the new dimension of dataflow constraints that regulate the movement of data across geographical or institutional borders. For example, European \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yadav:2021:QDV, author = "Piyush Yadav and Dhaval Salwala and Felipe Arruda Pontes and Praneet Dhingra and Edward Curry", title = "Query-driven video event processing for the {Internet of Multimedia Things}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2847--2850", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476360", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476360", abstract = "Advances in Deep Neural Network (DNN) techniques have revolutionized video analytics and unlocked the potential for querying and mining video event patterns. This paper details GNOSIS, an event processing platform to perform near-real-time video event \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koutroumanis:2021:DNU, author = "Nikolaos Koutroumanis and Nikolaos Kousathanas and Christos Doulkeridis and Akrivi Vlachou", title = "A demonstration of {NoDA}: unified access to {NoSQL} stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2851--2854", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476361", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476361", abstract = "In this demo paper, we present a system prototype, called NoDA, that unifies access to NoSQL stores, by exposing a single interface to big data developers. This hides the heterogeneity of NoSQL stores, in terms of different query languages, non-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sen:2021:APP, author = "Rathijit Sen and Abhishek Roy and Alekh Jindal and Rui Fang and Jeff Zheng and Xiaolei Liu and Ruiping Li", title = "{AutoExecutor}: predictive parallelism for {Spark SQL} queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2855--2858", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476362", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476362", abstract = "Right-sizing resources for query execution is important for cost-efficient performance, but estimating how performance is affected by resource allocations, upfront, before query execution is difficult. 
We demonstrate AutoExecutor, a predictive system \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:CBA, author = "Jiaxiang Liu and Karl Knopf and Yiqing Tan and Bolin Ding and Xi He", title = "Catch a blowfish alive: a demonstration of policy-aware differential privacy for interactive data exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2859--2862", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476363", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476363", abstract = "Policy-aware differential privacy (DP) frameworks such as Blowfish privacy enable more accurate query answers than standard DP. In this work, we build the first policy-aware DP system for interactive data exploration, BlowfishDB, that aims to (i) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ouellette:2021:RDL, author = "Paul Ouellette and Aidan Sciortino and Fatemeh Nargesian and Bahar Ghadiri Bashardoost and Erkang Zhu and Ken Q. Pu and Ren{\'e}e J. Miller", title = "{RONIN}: data lake exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2863--2866", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476364", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476364", abstract = "Dataset discovery can be performed using search (with a query or keywords) to find relevant data. 
However, the result of this discovery can be overwhelming to explore. Existing navigation techniques mostly focus on linkage graphs that enable navigation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boniol:2021:SAS, author = "Paul Boniol and John Paparrizos and Themis Palpanas and Michael J. Franklin", title = "{SAND} in action: subsequence anomaly detection for streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2867--2870", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476365", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476365", abstract = "Subsequence anomaly detection in long data series is a significant problem. While the demand for real-time analytics and decision making increases, anomaly detection methods have to operate over streams and handle drifts in data distribution. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koutras:2021:VAM, author = "Christos Koutras and Kyriakos Psarakis and George Siachamis and Andra Ionescu and Marios Fragkoulis and Angela Bonifati and Asterios Katsifodimos", title = "{Valentine} in action: matching tabular data at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2871--2874", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476366", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476366", abstract = "Capturing relationships among heterogeneous datasets in large data lakes --- traditionally termed schema matching --- is one of the most challenging problems that corporations and institutions face nowadays. Discovering and integrating datasets heavily \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guan:2021:GDE, author = "Sheng Guan and Hanchao Ma and Sutanay Choudhury and Yinghui Wu", title = "{GEDet}: detecting erroneous nodes with a few examples", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2875--2878", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476367", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476367", abstract = "Detecting nodes with erroneous values in real-world graphs remains challenging due to the lack of examples and various error scenarios. We demonstrate GEDet, an error detection engine that can detect erroneous nodes in graphs with a few examples. 
The \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2021:GUE, author = "Wenfei Fan and Tao He and Longbin Lai and Xue Li and Yong Li and Zhao Li and Zhengping Qian and Chao Tian and Lei Wang and Jingbo Xu and Youyang Yao and Qiang Yin and Wenyuan Yu and Jingren Zhou and Diwen Zhu and Rong Zhu", title = "{GraphScope}: a unified engine for big graph processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2879--2892", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476369", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476369", abstract = "GraphScope is a system and a set of language extensions that enable a new programming interface for large-scale distributed graph computing. It generalizes previous graph processing frameworks (e.g., Pregel, GraphX) and distributed graph databases (e.g., \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shang:2021:DSI, author = "Zeyuan Shang and Emanuel Zgraggen and Benedetto Buratti and Philipp Eichmann and Navid Karimeddiny and Charlie Meyer and Wesley Runnels and Tim Kraska", title = "{Davos}: a system for interactive data-driven decision making", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2893--2905", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476370", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476370", abstract = "Recently, a new horizon in data analytics, prescriptive analytics, is becoming more and more important to make data-driven decisions. As opposed to the progress of democratizing data acquisition and access, making data-driven decisions remains a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qin:2021:MEU, author = "An Qin and Mengbai Xiao and Yongwei Wu and Xinjie Huang and Xiaodong Zhang", title = "{Mixer}: efficiently understanding and retrieving visual content at web-scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2906--2917", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476371", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476371", abstract = "Visual contents, including images and videos, are dominant on the Internet today. 
The conventional search engine is mainly designed for textual documents, which must be extended to process and manage increasingly high volumes of visual data objects. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Justo:2021:TPF, author = "David Justo and Shaoqing Yi and Lukas Stadler and Nadia Polikarpova and Arun Kumar", title = "Towards a polyglot framework for factorized {ML}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2918--2931", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476372", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476372", abstract = "Optimizing machine learning (ML) workloads on structured data is a key concern for data platforms. One class of optimizations called ``factorized ML'' helps reduce ML runtimes over multi-table datasets by pushing ML computations down through joins, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dayan:2021:EML, author = "Niv Dayan and Moshe Twitto and Yuval Rochman and Uri Beitler and Itai {Ben Zion} and Edward Bortnikov and Shmuel Dashevsky and Ofer Frishman and Evgeni Ginzburg and Igal Maly and Avraham (Poza) Meir and Mark Mokryn and Iddo Naiss and Noam Rabinovich", title = "The end of {Moore}'s law and the rise of the data processor", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2932--2944", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476373", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476373", abstract = "With the end of Moore's Law, database architects are turning to hardware accelerators to offload computationally intensive tasks from the CPU. In this paper, we show that accelerators can facilitate far more than just computation: they enable algorithms and data structures that lavishly expand computation in order to optimize for disparate cost metrics. We introduce the Pliops Extreme Data Processor (XDP), a novel storage engine implemented from the ground up using customized hardware. At its core, XDP consists of an accelerated hash table to index the data in storage using less memory and fewer storage accesses for queries than the best alternative. XDP also employs an accelerated compressor, a capacitor, and a lock-free RAID sub-system to minimize storage space and recovery time while minimizing performance penalties. As a result, XDP overcomes cost contentions that have so far been inescapable.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Murray:2021:TDM, author = "Derek G. 
Murray and Jir{\'\i} Simsa and Ana Klimovic and Ihor Indyk", title = "\pkg{tf.data}: a machine learning data processing framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2945--2958", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476374", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476374", abstract = "Training machine learning models requires feeding input data for models to ingest. Input pipelines for machine learning jobs are often challenging to implement efficiently as they require reading large volumes of data, applying complex transformations, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Eltabakh:2021:BBA, author = "Mohamed Eltabakh and Anantha Subramanian and Awny Al-Omari and Mohammed Al-Kateb and Sanjay Nair and Mahbub Hasan and Wellington Cabrera and Charles Zhang and Amit Kishore and Snigdha Prasad", title = "Not black-box anymore!: enabling analytics-aware optimizations in {Teradata Vantage}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2959--2971", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476375", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476375", abstract = "Teradata Vantage is a platform for integrating a broad range of analytical functions and capabilities with the Teradata's SQL engine. One of the main challenges in optimizing the execution of these analytical functions is that many of them are not only \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:FAE, author = "Yingda Chen and Jiamang Wang and Yifeng Lu and Ying Han and Zhiqiang Lv and Xuebin Min and Hua Cai and Wei Zhang and Haochuan Fan and Chao Li and Tao Guan and Wei Lin and Yangqing Jia and Jingren Zhou", title = "{Fangorn}: adaptive execution framework for heterogeneous workloads on shared clusters", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2972--2985", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476376", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476376", abstract = "Pervasive needs for data explorations at all scales have populated modern distributed platforms with workloads of different characteristics. The growing complexities and diversities have thereafter imposed distinct challenges to execute them on shared \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agiwal:2021:NPS, author = "Ankur Agiwal and Kevin Lai and Gokul Nath Babu Manoharan and Indrajit Roy and Jagan Sankaranarayanan and Hao Zhang and Tao Zou and Min Chen and Zongchang (Jim) Chen and Ming Dai and Thanh Do and Haoyu Gao and Haoyan Geng and Raman Grover and Bo Huang and Yanlai Huang and Zhi (Adam) Li and Jianyi Liang and Tao Lin and Li Liu and Yao Liu and Xi Mao and Yalan (Maya) Meng and Prashant Mishra and Jay Patel and Rajesh S. R. 
and Vijayshankar Raman and Sourashis Roy and Mayank Singh Shishodia and Tianhang Sun and Ye (Justin) Tang and Junichi Tatemura and Sagar Trehan and Ramkumar Vadali and Prasanna Venkatasubramanian and Gensheng Zhang and Kefei Zhang and Yupu Zhang and Zeleng Zhuang and Goetz Graefe and Divyakant Agrawal and Jeff Naughton and Sujata Kosalge and Hakan Hac{\i}g{\"u}m{\"u}{\c{s}}", title = "{Napa}: powering scalable data warehousing with robust query performance at {Google}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2986--2997", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476377", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476377", abstract = "Google services continuously generate vast amounts of application data. This data provides valuable insights to business users. We need to store and serve these planet-scale data sets under the extremely demanding requirements of scalability, sub-second \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2021:ABR, author = "Rubao Lee and Minghong Zhou and Chi Li and Shenggang Hu and Jianping Teng and Dongyang Li and Xiaodong Zhang", title = "The art of balance: a {RateupDB\texttrademark{}} experience of building a {CPU\slash GPU} hybrid database product", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "2999--3013", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476378", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476378", abstract = "GPU-accelerated database systems have been studied for more than 10 years, ranging from prototyping development to industry products serving in multiple domains of data applications. Existing GPU database research solutions are often focused on specific \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2021:RTL, author = "Audrey Cheng and Xiao Shi and Lu Pan and Anthony Simpson and Neil Wheaton and Shilpa Lawande and Nathan Bronson and Peter Bailis and Natacha Crooks and Ion Stoica", title = "{RAMP-TAO}: layering atomic transactions on {Facebook}'s online {TAO} data store", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3014--3027", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476379", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476379", abstract = "Facebook's graph store TAO, like many other distributed data stores, traditionally prioritizes availability, efficiency, and scalability over strong consistency or isolation guarantees to serve its large, read-dominant workloads. As product developers \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:OAD, author = "Guoliang Li and Xuanhe Zhou and Ji Sun and Xiang Yu and Yue Han and Lianyuan Jin and Wenbo Li and Tianqing Wang and Shifu Li", title = "{openGauss}: an autonomous database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3028--3042", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476380", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476380", abstract = "Although learning-based database optimization techniques have been studied from academia in recent years, they have not been widely deployed in commercial database systems. In this work, we build an autonomous database framework and integrate our \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Potharaju:2021:HIS, author = "Rahul Potharaju and Terry Kim and Eunjin Song and Wentao Wu and Lev Novik and Apoorve Dave and Andrew Fogarty and Pouria Pirzadeh and Vidip Acharya and Gurleen Dhody and Jiying Li and Sinduja Ramanujam and Nicolas Bruno and C{\'e}sar A. Galindo-Legaria and Vivek Narasayya and Surajit Chaudhuri and Anil K. 
Nori and Tomas Talius and Raghu Ramakrishnan", title = "{Hyperspace}: the indexing subsystem of {Azure Synapse}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3043--3055", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476382", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476382", abstract = "Microsoft recently introduced Azure Synapse Analytics, which offers an integrated experience across data ingestion, storage, and querying in Apache Spark and T-SQL over data in the lake, including files and warehouse tables. In this paper, we present \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2021:SVB, author = "Bolong Zheng and Lei Bi and Juan Cao and Hua Chai and Jun Fang and Lu Chen and Yunjun Gao and Xiaofang Zhou and Christian S. Jensen", title = "{SpeakNav}: voice-based route description language understanding for template-driven path search", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3056--3068", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476383", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476383", abstract = "Many navigation applications take natural language speech as input, which avoids users typing in words and thus improves traffic safety. However, navigation applications often fail to understand a user's free-form description of a route. In addition, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gomes:2021:RML, author = "Ana Sofia Gomes and Jo{\~a}o Oliveirinha and Pedro Cardoso and Pedro Bizarro", title = "{Railgun}: managing large streaming windows under {MAD} requirements", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3069--3082", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476384", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476384", abstract = "Some mission critical systems, e.g., fraud detection, require accurate, real-time metrics over long time sliding windows on applications that demand high throughput and low latencies. As these applications need to run ``forever'' and cope with large, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Edara:2021:BMW, author = "Pavan Edara and Mosha Pasumansky", title = "Big metadata: when metadata is big data", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3083--3095", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476385", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476385", abstract = "The rapid emergence of cloud data warehouses like Google BigQuery has redefined the landscape of data analytics. With the growth of data volumes, such systems need to scale to hundreds of EiB of data in the near future. This growth is accompanied by an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Stoddard:2021:TRF, author = "Josh Stoddard and Adam Mustafa and Naveen Goela", title = "{Tanium Reveal}: a federated search engine for querying unstructured file data on large enterprise networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3096--3109", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476386", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476386", abstract = "Tanium Reveal is a federated search engine deployed on large-scale enterprise networks that is capable of executing data queries across billions of private data files within 60 seconds. Data resides at the edge of networks, potentially distributed on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gencer:2021:HJL, author = "Can Gencer and Marko Topolnik and Viliam {\v{D}}urina and Emin Demirci and Ensar B. 
Kahveci and Ali G{\"u}rb{\"u}z and Ond{\v{r}}ej Luk{\'a}{\v{s}} and J{\'o}zsef Bart{\'o}k and Grzegorz Gierlach and Franti{\v{s}}ek Hartman and Ufuk Y{\i}lmaz and Mehmet Do{\u{g}}an and Mohamed Mandouh and Marios Fragkoulis and Asterios Katsifodimos", title = "{Hazelcast Jet}: low-latency stream processing at the 99.99-th percentile", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3110--3121", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476387", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476387", abstract = "Jet is an open source, high performance, distributed stream processor built at Hazelcast during the last five years. Jet was engineered with millisecond latency on the 99.99th percentile as its primary design goal. Originally Jet's purpose was to be an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Roy:2021:SWO, author = "Abhishek Roy and Alekh Jindal and Priyanka Gomatam and Xiating Ouyang and Ashit Gosalia and Nishkam Ravi and Swinky Mann and Prakhar Jain", title = "{SparkCruise}: workload optimization in managed {Spark} clusters at {Microsoft}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3122--3134", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476388", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476388", abstract = "Today cloud companies offer fully managed Spark services. This has made it easy to onboard new customers but has also increased the volume of users and their workload sizes. 
However, both cloud providers and users lack the tools and time to optimize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Akidau:2021:WSP, author = "Tyler Akidau and Edmon Begoli and Slava Chernyak and Fabian Hueske and Kathryn Knight and Kenneth Knowles and Daniel Mills and Dan Sotolongo", title = "Watermarks in stream processing systems: semantics and comparative analysis of {Apache Flink} and {Google Cloud Dataflow}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3135--3147", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476389", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476389", abstract = "Streaming data processing is an exercise in taming disorder: from oftentimes huge torrents of information, we hope to extract powerful and timely analyses. But when dealing with streaming data, the unbounded and temporally disordered nature of real-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Power:2021:CBD, author = "Conor Power and Hiren Patel and Alekh Jindal and Jyoti Leeka and Bob Jenkins and Michael Rys and Ed Triou and Dexin Zhu and Lucky Katahanas and Chakrapani Bhat Talapady and Joshua Rowe and Fan Zhang and Rich Draves and Marc Friedman and Ivan Santa Maria Filho and Amrish Kumar", title = "The {Cosmos} big data platform at {Microsoft}: over a decade of progress and a decade to look forward", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3148--3161", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476390", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476390", abstract = "The twenty-first century has been dominated by the need for large scale data processing, marking the birth of big data platforms such as Cosmos. This paper describes the evolution of the exabyte-scale Cosmos big data platform at Microsoft; our journey \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pandis:2021:EAR, author = "Ippokratis Pandis", title = "The evolution of {Amazon Redshift}", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3162--3174", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476391", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476391", abstract = "In 2013, Amazon Web Services revolutionized the data warehousing industry by launching Amazon Redshift [7], the first fully managed, petabyte-scale enterprise-grade cloud data warehouse. Amazon Redshift made it simple and cost-effective to efficiently \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Razniewski:2021:LMK, author = "Simon Razniewski and Hiba Arnaout and Shrestha Ghosh and Fabian Suchanek", title = "On the limits of machine knowledge: completeness, recall and negation in web-scale knowledge bases", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "12", pages = "3175--3177", month = jul, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3476311.3476401", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:41:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3476311.3476401", abstract = "General-purpose knowledge bases (KBs) are an important component of several data-driven applications. Pragmatically constructed from available web sources, these KBs are far from complete, which poses a set of challenges in curation as well as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Orr:2021:MMP,
  author =       "Laurel Orr and Atindriyo Sanyal and Xiao Ling and
                 Karan Goel and Megan Leszczynski",
  title =        "Managing {ML} pipelines: feature stores and the coming
                 wave of embedding ecosystems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3178--3181",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476402",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476402",
  abstract =     "The industrial machine learning pipeline requires
                 iterating on model features, training and deploying
                 models, and monitoring deployed models at scale.
                 Feature stores were developed to manage and standardize
                 the engineer's workflow in this end-to-end \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2021:DAM,
  author =       "Yuliang Li and Xiaolan Wang and Zhengjie Miao and
                 Wang-Chiew Tan",
  title =        "Data augmentation for {ML}-driven data preparation and
                 integration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3182--3185",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476403",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476403",
  abstract =     "In recent years, we have witnessed the development of
                 novel data augmentation (DA) techniques for creating
                 additional training data needed by machine learning
                 based solutions.
In this tutorial, we will provide a comprehensive overview of
                 techniques \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zalipynis:2021:ADP,
  author =       "Ramon Antonio Rodriges Zalipynis",
  title =        "Array {DBMS}: past, present, and (near) future",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3186--3189",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476404",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476404",
  abstract =     "Array DBMSs strive to be the best systems for
                 managing, processing, and even visualizing big $N$-d
                 arrays. The last decade blossomed with R\&D in array
                 DBMS, making it a young and fast-evolving area. We
                 present the first comprehensive tutorial on array
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2021:MLD,
  author =       "Guoliang Li and Xuanhe Zhou and Lei Cao",
  title =        "Machine learning for databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3190--3193",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476405",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476405",
  abstract =     "Machine learning techniques have been proposed to
                 optimize the databases. For example, traditional
                 empirical database optimization techniques (e.g., cost
                 estimation, join order selection, knob tuning, index
                 and view advisor) cannot meet the high-
\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kargar:2021:ELN,
  author =       "Saeed Kargar and Faisal Nawab",
  title =        "Extending the lifetime of {NVM}: challenges and
                 opportunities",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3194--3197",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476406",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476406",
  abstract =     "Recently, Non-Volatile Memory (NVM) technology has
                 revolutionized the landscape of memory systems. With
                 many advantages, such as non volatility and near zero
                 standby power consumption, these byte-addressable
                 memory technologies are taking the place of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Echihabi:2021:NTH,
  author =       "Karima Echihabi and Kostas Zoumpatianos and Themis
                 Palpanas",
  title =        "New trends in high-{D} vector similarity search:
                 {AI}-driven, progressive, and distributed",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3198--3201",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476407",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476407",
  abstract =     "Similarity search is a core operation of many critical
                 applications, involving massive collections of
                 high-dimensional (high-d) objects. Objects can be data
                 series, text, multimedia, graphs, database tables or
                 deep network embeddings.
In this tutorial, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Jindal:2021:MLC,
  author =       "Alekh Jindal and Matteo Interlandi",
  title =        "Machine learning for cloud data systems: the progress
                 so far and the path forward",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3202--3205",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476408",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476408",
  abstract =     "The goal of this tutorial is to educate the audience
                 about the state of the art in ML for cloud data
                 systems, both in research and in practice. The tutorial
                 is divided in two parts: the progress, and the path
                 forward. Part I covers the recent successes \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Davidson:2021:JCT,
  author =       "Susan B. Davidson",
  title =        "It's not just cookies and tea",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3206--3206",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476409",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476409",
  abstract =     "Three of the major research themes over my career have
                 been concurrency, integration and provenance. In this
                 talk, I will explain why these themes are not only
                 important in database research, but how they have
                 played a role in my personal success. I \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Neumann:2021:ECQ,
  author =       "Thomas Neumann",
  title =        "Evolution of a compiling query engine",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3207--3210",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476410",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476410",
  abstract =     "In 2011 we showed how to use dynamic code generation
                 to process queries in a data-centric manner. This
                 execution model can produce compact and efficient code
                 and was successfully used by both our own systems and
                 systems of other groups. As the systems \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Pavlo:2021:MYD,
  author =       "Andrew Pavlo and Matthew Butrovich and Lin Ma and
                 Prashanth Menon and Wan Shen Lim and Dana {Van Aken}
                 and William Zhang",
  title =        "Make your database system dream of electric sheep:
                 towards self-driving operation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3211--3221",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476411",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476411",
  abstract =     "Database management systems (DBMSs) are notoriously
                 difficult to deploy and administer. Self-driving DBMSs
                 seek to remove these impediments by managing themselves
                 automatically. Despite decades of DBMS auto-tuning
                 research, a truly autonomous, self- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kraska:2021:TIO,
  author =       "Tim Kraska",
  title =        "Towards instance-optimized data systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3222--3232",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476392",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476392",
  abstract =     "In recent years, we have seen increased interest in
                 applying machine learning to system problems. For
                 example, there has been work on applying machine
                 learning to improve query optimization, indexing,
                 storage layouts, scheduling, log-structured merge
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Weikum:2021:KGD,
  author =       "Gerhard Weikum",
  title =        "Knowledge graphs 2021: a data odyssey",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3233--3238",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476393",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476393",
  abstract =     "Providing machines with comprehensive knowledge of the
                 world's entities and their relationships has been a
                 long-standing vision and challenge for AI. Over the
                 last 15 years, huge knowledge bases, also known as
                 knowledge graphs, have been automatically \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ives:2021:FDB,
  author =       "Zachary G.
Ives and Rachel Pottinger and Arun Kumar and
                 Johannes Gehrke and Jana Giceva",
  title =        "The future of data(base) education: is the ``cow
                 book'' dead?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3239--3239",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476394",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476394",
  abstract =     "This panel encourages a debate over the future of
                 database education and its relationship to Data
                 Science: Are Computer Science (CS) and Data Science
                 (DS) different disciplines about to split, and how does
                 that affect how we teach our field? Is there a ``data''
                 course that belongs in CS that all of our students
                 should take? Who is the traditional database course,
                 e.g. based on the ``cow book'', relevant to? What
                 traditional topics should we not be teaching in our
                 core data course(s) and which ones should be added?
                 What do we teach the student who has one elective for
                 data science? How does our community position itself
                 for leadership in CS given the popularity of DS?",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Remis:2021:UVI,
  author =       "Luis Remis and Chaunt{\'e} W.
Lacewell",
  title =        "Using {VDMS} to index and search {100M} images",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "12",
  pages =        "3240--3252",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3476311.3476381",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:41:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3476311.3476381",
  abstract =     "Data scientists spend most of their time dealing with
                 data preparation, rather than doing what they know
                 best: build machine learning models and algorithms to
                 solve previously unsolvable problems. In this paper, we
                 describe the Visual Data Management \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2021:TEF,
  author =       "Jian Liu and Kefei Wang and Feng Chen",
  title =        "{TSCache}: an efficient flash-based caching scheme for
                 time-series data workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3253--3266",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484225",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484225",
  abstract =     "Time-series databases are becoming an indispensable
                 component in today's data centers. In order to manage
                 the rapidly growing time-series data, we need an
                 effective and efficient system solution to handle the
                 huge traffic of time-series data queries. A \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2021:MRL,
  author =       "Huayi Wang and Jingfan Meng and Long Gong and Jun Xu
                 and Mitsunori Ogihara",
  title =        "{MP-RW-LSH}: an efficient multi-probe {LSH} solution
                 to {ANNS-L$_1$}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3267--3280",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484226",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484226",
  abstract =     "Approximate Nearest Neighbor Search (ANNS) is a
                 fundamental algorithmic problem, with numerous
                 applications in many areas of computer science.
                 Locality-Sensitive Hashing (LSH) is one of the most
                 popular solution approaches for ANNS. A common
                 shortcoming of many LSH schemes is that since they
                 probe only a single bucket in a hash table, they need
                 to use a large number of hash tables to achieve a high
                 query accuracy. For ANNS-L$_2$, a multi-probe scheme
                 was proposed to overcome this drawback by strategically
                 probing multiple buckets in a hash table. In this work,
                 we propose MP-RW-LSH, the first and so far only
                 multi-probe LSH solution to ANNS in L$_1$ distance, and
                 show that it achieves a better tradeoff between
                 scalability and query efficiency than all existing
                 LSH-based solutions. We also explain why a
                 state-of-the-art ANNS-L$_1$ solution called Cauchy
                 projection LSH (CP-LSH) is fundamentally not suitable
                 for multi-probe extension. Finally, as a use case, we
                 construct, using MP-RW-LSH as the underlying
                 ``ANNS-L$_1$ engine'', a new ANNS-E (E for edit
                 distance) solution that beats the state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Mailis:2021:VSK,
  author =       "Theofilos Mailis and Yannis Kotidis and Stamatis
                 Christoforidis and Evgeny Kharlamov and Yannis
                 Ioannidis",
  title =        "View selection over knowledge graphs in triple
                 stores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3281--3294",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484227",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484227",
  abstract =     "Knowledge Graphs (KGs) are collections of
                 interconnected and annotated entities that have become
                 powerful assets for data integration, search
                 enhancement, and other industrial applications.
                 Knowledge Graphs such as DBPEDIA may contain billion of
                 triple \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2021:FHO,
  author =       "Dongjie Li and Siyi Lv and Yanyu Huang and Yijing Liu
                 and Tong Li and Zheli Liu and Liang Guo",
  title =        "Frequency-hiding order-preserving encryption with
                 small client storage",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3295--3307",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484228",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484228",
  abstract =     "The range query on encrypted databases is usually
                 implemented using the order-preserving encryption (OPE)
                 technique which preserves the order of plaintexts.
Since the frequency leakage of plaintexts makes OPE vulnerable to
                 frequency-analyzing attacks, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Koutsoukos:2021:MMR,
  author =       "Dimitrios Koutsoukos and Ingo M{\"u}ller and Renato
                 Marroqu{\'\i}n and Ana Klimovic and Gustavo Alonso",
  title =        "{Modularis}: modular relational analytics over
                 heterogeneous distributed platforms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3308--3321",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484229",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484229",
  abstract =     "The enormous quantity of data produced every day
                 together with advances in data analytics has led to a
                 proliferation of data management and analysis systems.
                 Typically, these systems are built around highly
                 specialized monolithic operators optimized for
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lou:2021:TTA,
  author =       "Yunkai Lou and Chaokun Wang and Tiankai Gu and Hao
                 Feng and Jun Chen and Jeffrey Xu Yu",
  title =        "Time-topology analysis",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3322--3334",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484230",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484230",
  abstract =     "Many real-world networks have been evolving, and are
                 finely modeled as temporal graphs from the viewpoint of
                 the graph theory. A temporal graph is informative, and
                 always contains two types of information, i.e., the
                 temporal information and topological \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bernau:2021:QIC,
  author =       "Daniel Bernau and G{\"u}nther Eibl and Philip W.
                 Grassal and Hannah Keller and Florian Kerschbaum",
  title =        "Quantifying identifiability to choose and audit $
                 \epsilon $ in differentially private deep learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3335--3347",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484231",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484231",
  abstract =     "Differential privacy allows bounding the influence
                 that training data records have on a machine learning
                 model. To use differential privacy in machine learning,
                 data scientists must choose privacy parameters ($
                 \epsilon $, $ \delta $).
Choosing meaningful privacy \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Laigner:2021:DMM,
  author =       "Rodrigo Laigner and Yongluan Zhou and Marcos Antonio
                 Vaz Salles and Yijian Liu and Marcos Kalinowski",
  title =        "Data management in microservices: state of the
                 practice, challenges, and research directions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3348--3361",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484232",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484232",
  abstract =     "Microservices have become a popular architectural
                 style for data-driven applications, given their ability
                 to functionally decompose an application into small and
                 autonomous services to achieve scalability, strong
                 isolation, and specialization of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ammerlaan:2021:PDM,
  author =       "Remmelt Ammerlaan and Gilbert Antonius and Marc
                 Friedman and H. M.
Sajjad Hossain and Alekh Jindal and Peter Orenberg and
                 Hiren Patel and Shi Qiao and Vijay Ramani and Lucas
                 Rosenblatt and Abhishek Roy and Irene Shaffer and
                 Soundarajan Srinivasan and Markus Weimer",
  title =        "{PerfGuard}: deploying {ML}-for-systems without
                 performance regressions, almost!",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3362--3375",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484233",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484233",
  abstract =     "Modern data processing systems require optimization at
                 massive scale, and using machine learning to optimize
                 these systems (ML-for-systems) has shown promising
                 results. Unfortunately, ML-for-systems is subject to
                 over generalizations that do not capture \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ding:2021:DDS,
  author =       "Bailu Ding and Surajit Chaudhuri and Johannes Gehrke
                 and Vivek Narasayya",
  title =        "{DSB}: a decision support benchmark for
                 workload-driven and traditional database systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3376--3388",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484234",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484234",
  abstract =     "We describe a new benchmark, DSB, for evaluating both
                 workload-driven and traditional database systems on
                 modern decision support workloads. DSB is adapted from
                 the widely-used industrial-standard TPC-DS benchmark.
It enhances the TPC-DS benchmark with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Hernandez:2021:CHP,
  author =       "Daniel Hern{\'a}ndez and Luis Gal{\'a}rraga and Katja
                 Hose",
  title =        "Computing how-provenance for {SPARQL} queries via
                 query rewriting",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3389--3401",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484235",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484235",
  abstract =     "Over the past few years, we have witnessed the
                 emergence of large knowledge graphs built by extracting
                 and combining information from multiple sources. This
                 has propelled many advances in query processing over
                 knowledge graphs, however the aspect of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2021:UUD,
  author =       "Junxiong Wang and Immanuel Trummer and Debabrota
                 Basu",
  title =        "{UDO}: universal database optimization using
                 reinforcement learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3402--3414",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484236",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484236",
  abstract =     "UDO is a versatile tool for offline tuning of database
                 systems for specific workloads.
UDO can consider a variety of tuning choices, reaching from picking
                 transaction code variants over index selections up to
                 database system parameter tuning. UDO uses \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Feldmann:2021:ITA,
  author =       "Anja Feldmann",
  title =        "{Internet} traffic analysis at scale",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3415--3415",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484237",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484237",
  abstract =     "In this talk, I will use multiple internet measurement
                 studies as examples to outline the challenges that we
                 face when performing internet-scale traffic analysis,
                 including implications of the COVID-19 pandemic on
                 internet traffic as well as detecting \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Koutra:2021:PSG,
  author =       "Danai Koutra",
  title =        "The power of summarization in graph mining and
                 learning: smaller data, faster methods, more
                 interpretability",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3416--3416",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484238",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484238",
  abstract =     "Our ability to generate, collect, and archive data
                 related to everyday activities, such as interacting on
                 social media, browsing the web, and monitoring
                 well-being, is rapidly increasing. Getting the most
                 benefit from this large-scale data requires \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Shah:2021:SPL,
  author =       "Nigam Shah",
  title =        "Summarizing patients like mine via an on-demand
                 consultation service",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3417--3417",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484242",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484242",
  abstract =     "Using evidence derived from previously collected
                 medical records to guide patient care has been a
                 long-standing vision of clinicians and informaticians,
                 and one with the potential to transform medical
                 practice. We offered an on-demand consultation
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Vanschoren:2021:TSO,
  author =       "Joaquin Vanschoren",
  title =        "Towards scalable online machine learning
                 collaborations with {OpenML}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3418--3418",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484239",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484239",
  abstract =     "Is massively collaborative machine learning possible?
                 Can we share and organize our collective knowledge of
                 machine learning to solve ever more challenging
                 problems? In a way, yes: as a community, we are already
                 very successful at developing high- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Vartak:2021:MMI,
  author =       "Manasi Vartak",
  title =        "From {ML} models to intelligent applications: the rise
                 of {MLOps}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "14",
  number =       "13",
  pages =        "3419--3419",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3484224.3484240",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 29 16:38:15 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3484224.3484240",
  abstract =     "The last 5+ years in ML have focused on building the
                 best models, hyperparameter optimization, parallel
                 training, massive neural networks, etc. Now that the
                 building of models has become easy, models are being
                 integrated into every piece of software and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zaharia:2021:DPF, author = "Matei Zaharia", title = "Designing production-friendly machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "14", number = "13", pages = "3420--3420", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3484224.3484241", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 29 16:38:15 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3484224.3484241", abstract = "Building production ML applications is difficult because of their resource cost and complex failure modes. I will discuss these challenges from two perspectives: the Stanford DAWN Lab and experience with large-scale commercial ML users at Databricks. I \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2021:ASA, author = "Kang Zhao and Liuyihan Song and Yingya Zhang and Pan Pan and Yinghui Xu and Rong Jin", title = "{ANN} softmax: acceleration of extreme classification training", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "1--10", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485451", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485451", abstract = "Thanks to the popularity of GPU and the growth of its computational power, more and more deep learning tasks, such as face recognition, image retrieval and word embedding, can take advantage of extreme classification to improve accuracy. However, it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2021:WTD, author = "Gyeong-In Yu and Saeed Amizadeh and Sehoon Kim and Artidoro Pagnoni and Ce Zhang and Byung-Gon Chun and Markus Weimer and Matteo Interlandi", title = "{WindTunnel}: towards differentiable {ML} pipelines beyond a single model", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "11--20", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485452", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485452", abstract = "While deep neural networks (DNNs) have shown to be successful in several domains like computer vision, non-DNN models such as linear models and gradient boosting trees are still considered state-of-the-art over tabular data. When using these models, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Skiadopoulos:2021:DDO, author = "Athinagoras Skiadopoulos and Qian Li and Peter Kraft and Kostis Kaffes and Daniel Hong and Shana Mathew and David Bestor and Michael Cafarella and Vijay Gadepally and Goetz Graefe and Jeremy Kepner and Christos Kozyrakis and Tim Kraska and Michael Stonebraker and Lalith Suresh and Matei Zaharia", title = "{DBOS}: a {DBMS}-oriented operating system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "21--30", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485454", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485454", abstract = "This paper lays out the rationale for building a completely new operating system (OS) stack. Rather than build on a single node OS together with separate cluster schedulers, distributed filesystems, and network managers, we argue that a distributed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jain:2021:DIA, author = "Arjit Jain and Sunita Sarawagi and Prithviraj Sen", title = "Deep indexed active learning for matching heterogeneous entity representations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "31--45", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485455", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485455", abstract = "Given two large lists of records, the task in entity resolution (ER) is to find the pairs from the Cartesian product of the lists that correspond to the same real world entity. Typically, passive learning methods on such tasks require large amounts of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2021:LQR, author = "Xuanhe Zhou and Guoliang Li and Chengliang Chai and Jianhua Feng", title = "A learned query rewrite system using {Monte Carlo} tree search", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "46--58", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485456", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485456", abstract = "Query rewrite transforms a SQL query into an equivalent one but with higher performance. However, SQL rewrite is an NP-hard problem, and existing approaches adopt heuristics to rewrite the queries. These heuristics have two main limitations. First, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2021:DCP, author = "Yin Lin and Brit Youngmann and Yuval Moskovitch and H. V. Jagadish and Tova Milo", title = "On detecting cherry-picked generalizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "59--71", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485457", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485457", abstract = "Generalizing from detailed data to statements in a broader context is often critical for users to make sense of large data sets. Correspondingly, poorly constructed generalizations might convey misleading information even if the statements are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2021:FNF, author = "Jiayi Wang and Chengliang Chai and Jiabin Liu and Guoliang Li", title = "{FACE}: a normalizing flow based cardinality estimator", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "72--84", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485458", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485458", abstract = "Cardinality estimation is one of the most important problems in query optimization. Recently, machine learning based techniques have been proposed to effectively estimate cardinality, which can be broadly classified into query-driven and data-driven \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2021:LCE, author = "Ji Sun and Jintao Zhang and Zhaoyan Sun and Guoliang Li and Nan Tang", title = "Learned cardinality estimation: a design space exploration and a comparative evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "85--97", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485459", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485459", abstract = "Cardinality estimation is core to the query optimizers of DBMSs. Non-learned methods, especially based on histograms and samplings, have been widely used in commercial and open-source DBMSs. Nevertheless, histograms and samplings can only be used to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2021:DAD, author = "Dong He and Maureen Daum and Walter Cai and Magdalena Balazinska", title = "{DeepEverest}: accelerating declarative top-{$K$} queries for deep neural network interpretation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "98--111", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485460", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485460", abstract = "We design, implement, and evaluate DeepEverest, a system for the efficient execution of interpretation by example queries over the activation values of a deep neural network. 
DeepEverest consists of an efficient indexing technique and a query execution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chatterjee:2021:CCC, author = "Subarna Chatterjee and Meena Jagadeesan and Wilson Qin and Stratos Idreos", title = "{Cosine}: a cloud-cost optimized self-designing key--value storage engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "112--126", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485461", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485461", abstract = "We present a self-designing key-value storage engine, Cosine, which can always take the shape of the close to ``perfect'' engine architecture given an input workload, a cloud budget, a target performance, and required cloud SLAs. By identifying and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Adnan:2021:ARS, author = "Muhammad Adnan and Yassaman Ebrahimzadeh Maboud and Divya Mahajan and Prashant J. Nair", title = "Accelerating recommendation system training by leveraging popular choices", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "1", pages = "127--140", month = sep, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3485450.3485462", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jan 20 16:04:55 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3485450.3485462", abstract = "Recommender models are commonly used to suggest relevant items to a user for e-commerce and online advertisement-based applications. 
These models use massive embedding tables to store numerical representation of items' and users' categorical variables \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2021:BCE, author = "Jianye Yang and Yun Peng and Wenjie Zhang", title = "$ (p, q) $-biclique counting and enumeration for large sparse bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "141--153", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489497", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489497", abstract = "In this paper, we study the problem of $ (p, q) $-biclique counting and enumeration for large sparse bipartite graphs. Given a bipartite $ G = (U, V, E) $, and two integer parameters $p$ and $q$, we aim to efficiently count and enumerate all $ (p, q) $-bicliques in $G$, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Graur:2021:EQL, author = "Dan Graur and Ingo M{\"u}ller and Mason Proffitt and Ghislain Fourny and Gordon T.
Watts and Gustavo Alonso", title = "Evaluating query languages and systems for high-energy physics data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "154--168", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489498", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489498", abstract = "In the domain of high-energy physics (HEP), query languages in general and SQL in particular have found limited acceptance. This is surprising since HEP data analysis matches the SQL model well: the data is fully structured and queried using mostly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hao:2021:DHC, author = "Kongzhang Hao and Long Yuan and Wenjie Zhang", title = "Distributed hop-constrained $s$--$t$ simple path enumeration at billion scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "169--182", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489499", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489499", abstract = "Hop-constrained s-t simple path (HC-s-t path) enumeration is a fundamental problem in graph analysis and has received considerable attention recently. Straightforward distributed solutions are inefficient and suffer from poor scalability when addressing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2021:EAO, author = "Jingzhi Fang and Yanyan Shen and Yue Wang and Lei Chen", title = "{ETO}: accelerating optimization of {DNN} operators by high-performance tensor program reuse", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "183--195", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489500", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489500", abstract = "Recently, deep neural networks (DNNs) have achieved great success in various applications, where low inference latency is important. Existing solutions either manually tune the kernel library or utilize search-based compilation to reduce the operator \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Grulich:2021:BEE, author = "Philipp Marian Grulich and Steffen Zeuch and Volker Markl", title = "{Babelfish}: efficient execution of polyglot queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "196--210", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489501", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489501", abstract = "Today's users of data processing systems come from different domains, have different levels of expertise, and prefer different programming languages. As a result, analytical workload requirements shifted from relational to polyglot queries involving \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2021:BCU, author = "Alexander Zhou and Yue Wang and Lei Chen", title = "Butterfly counting on uncertain bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "211--223", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489502", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489502", abstract = "When considering uncertain bipartite networks, the number of instances of the popular graphlet structure the butterfly may be used as an important metric to quickly gauge information about the network. This Uncertain Butterfly Count has practical usages \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cui:2021:MGG, author = "Yue Cui and Kai Zheng and Dingshan Cui and Jiandong Xie and Liwei Deng and Feiteng Huang and Xiaofang Zhou", title = "{METRO}: a generic graph neural network framework for multivariate time series forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "224--236", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489503", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489503", abstract = "Multivariate time series forecasting has been drawing increasing attention due to its prevalent applications. It has been commonly assumed that leveraging latent dependencies between pairs of variables can enhance prediction accuracy. 
However, most \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2021:LAE, author = "Congcong Ge and Xiaoze Liu and Lu Chen and Yunjun Gao and Baihua Zheng", title = "{LargeEA}: aligning entities for large-scale knowledge graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "237--245", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489504", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489504", abstract = "Entity alignment (EA) aims to find equivalent entities in different knowledge graphs (KGs). Current EA approaches suffer from scalability issues, limiting their usage in real-world EA scenarios. To tackle this challenge, we propose LargeEA to align \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2021:HHG, author = "Kejing Lu and Mineichi Kudo and Chuan Xiao and Yoshiharu Ishikawa", title = "{HVS}: hierarchical graph structure based on {Voronoi} diagrams for solving approximate nearest neighbor search", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "246--258", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489506", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489506", abstract = "Approximate nearest neighbor search (ANNS) is a fundamental problem that has a wide range of applications in information retrieval and data mining. 
Among state-of-the-art in-memory ANNS methods, graph-based methods have attracted particular interest \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arman:2021:OHP, author = "Arif Arman and Dmitri Loguinov", title = "{Origami}: a high-performance mergesort framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "259--271", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489507", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489507", abstract = "Mergesort is a popular algorithm for sorting real-world workloads as it is immune to data skewness, suitable for parallelization using vectorized intrinsics, and relatively simple to multi-thread. In this paper, we introduce Origami, an in-memory merge-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2021:LSL, author = "Renzhi Wu and Bolin Ding and Xu Chu and Zhewei Wei and Xiening Dai and Tao Guan and Jingren Zhou", title = "Learning to be a statistician: learned estimator for number of distinct values", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "272--284", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489508", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489508", abstract = "Estimating the number of distinct values (NDV) in a column is useful for many tasks in database systems, such as columnstore compression and data profiling.
In this work, we focus on how to derive accurate NDV estimations from random (online/offline) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2021:PFP, author = "Shangdi Yu and Yiqiu Wang and Yan Gu and Laxman Dhulipala and Julian Shun", title = "{ParChain}: a framework for parallel hierarchical agglomerative clustering using nearest-neighbor chain", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "285--298", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489509", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489509", abstract = "This paper studies the hierarchical clustering problem, where the goal is to produce a dendrogram that represents clusters at varying scales of a data set. We propose the ParChain framework for designing parallel hierarchical agglomerative clustering \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chauhan:2021:ARP, author = "Komal Chauhan and Kartik Jain and Sayan Ranu and Srikanta Bedathur and Amitabha Bagchi", title = "Answering regular path queries through exemplars", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "299--311", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489510", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489510", abstract = "Regular simple path query (RPQ) is one of the fundamental operators in graph analytics.
In an RPQ, the input is a graph, a source node and a regular expression. The goal is to identify all nodes that are connected to the source through a simple path \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2021:SHE, author = "Xupeng Miao and Hailin Zhang and Yining Shi and Xiaonan Nie and Zhi Yang and Yangyu Tao and Bin Cui", title = "{HET}: scaling out huge embedding model training via cache-enabled distributed framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "312--320", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489511", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489511", abstract = "Embedding models have been an effective learning paradigm for high-dimensional data. However, one open issue of embedding models is that their representations (latent factors) often result in large parameter space. We observe that existing distributed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:FFG, author = "Pengfei Li and Yu Hua and Jingnan Jia and Pengfei Zuo", title = "{FINEdex}: a fine-grained learned index scheme for scalable and concurrent memory systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "321--334", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489512", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489512", abstract = "Index structures in memory systems become important to improve the entire system performance. The promising learned indexes leverage deep-learning models to complement existing index structures and obtain significant performance improvements. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bai:2021:TTA, author = "Jiyang Bai and Peixiang Zhao", title = "{TaGSim}: type-aware graph similarity learning and computation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "335--347", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489513", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489513", abstract = "Computing similarity between graphs is a fundamental and critical problem in graph-based applications, and one of the most commonly used graph similarity measures is graph edit distance (GED), defined as the minimum number of graph edit operations that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2021:AIC, author = "Yuqing Zhu and Jing Tang and Xueyan Tang and Lei Chen", title = "Analysis of influence contribution in social advertising", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "348--360", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489514", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489514", abstract = "Online Social Network (OSN) providers usually conduct advertising campaigns by inserting social ads into promoted posts. Whenever a user engages in a promoted ad, she may further propagate the promoted ad to her followers recursively and the propagation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Theodorakis:2021:SSN, author = "Georgios Theodorakis and Fotios Kounelis and Peter Pietzuch and Holger Pirk", title = "{Scabbard}: single-node fault-tolerant stream processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "361--374", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489515", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489515", abstract = "Single-node multi-core stream processing engines (SPEs) can process hundreds of millions of tuples per second. Yet making them fault-tolerant with exactly-once semantics while retaining this performance is an open challenge: due to the limited I/O \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Konstantinidis:2021:EPC, author = "George Konstantinidis and Jet Holt and Adriane Chapman", title = "Enabling personal consent in databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "375--387", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489516", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3489496.3489516", abstract = "Users have the right to consent to the use of their data, but current methods are limited to very coarse-grained expressions of consent, as ``opt-in/opt-out'' choices for certain uses. In this paper we identify the need for fine-grained consent management \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:ESB, author = "Yejia Liu and Weiyuan Wu and Lampros Flokas and Jiannan Wang and Eugene Wu", title = "Enabling {SQL}-based training data debugging for federated learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "388--400", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494125", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494125", abstract = "How can we debug a logistic regression model in a federated learning setting when seeing the model behave unexpectedly (e.g., the model rejects all high-income customers' loan applications)? The SQL-based training data debugging framework has proved \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vaidya:2021:LQL, author = "Kapil Vaidya and Anshuman Dutt and Vivek Narasayya and Surajit Chaudhuri", title = "Leveraging query logs and machine learning for parametric query optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "401--413", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494126", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494126", abstract = "Parametric query optimization (PQO) must address two problems: identify a relatively small number of plans to cache for a parameterized query (populateCache), and efficiently select the best cached plan to use for executing any instance of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2021:PTS, author = "Yao Lu and Srikanth Kandula and Arnd Christian K{\"o}nig and Surajit Chaudhuri", title = "Pre-training summarization models of structured datasets for cardinality estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "414--426", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494127", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494127", abstract = "We consider the problem of pre-training models which convert structured datasets into succinct summaries that can be used to answer cardinality estimation queries. 
Doing so avoids per-dataset training and, in our experiments, reduces the time to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rao:2021:XEF, author = "Susie Xi Rao and Shuai Zhang and Zhichao Han and Zitao Zhang and Wei Min and Zhiyao Chen and Yinan Shan and Yang Zhao and Ce Zhang", title = "{xFraud}: explainable fraud transaction detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "427--436", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494128", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494128", abstract = "At online retail platforms, it is crucial to actively detect the risks of transactions to improve customer experience and minimize financial loss. In this work, we propose xFraud, an explainable fraud transaction prediction framework which is mainly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2021:SMG, author = "Ye Yuan and Delong Ma and Zhenyu Wen and Zhiwei Zhang and Guoren Wang", title = "Subgraph matching over graph federation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "437--450", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494129", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494129", abstract = "Many real-life applications require processing graph data across heterogeneous sources. 
In this paper, we define the graph federation that indicates that the graph data sources are temporarily federated and offer their data for users. Next, we propose a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Niu:2021:PBD, author = "Xing Niu and Boris Glavic and Ziyu Liu and Pengyuan Li and Dieter Gawlick and Vasudha Krishnaswamy and Zhen Hua Liu and Danica Porobic", title = "Provenance-based data skipping", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "451--464", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494130", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494130", abstract = "Database systems use static analysis to determine upfront which data is needed for answering a query and use indexes and other physical design techniques to speed-up access to that data. However, for important classes of queries, e.g., HAVING and top-k \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jin:2021:DTL, author = "Di Jin and Bunyamin Sisman and Hao Wei and Xin Luna Dong and Danai Koutra", title = "Deep transfer learning for multi-source entity linkage via domain adaptation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "465--477", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494131", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494131", abstract = "Multi-source entity linkage focuses on integrating knowledge from multiple sources by linking the records that represent the same real world entity. This is critical in high-impact applications such as data cleaning and user stitching. The state-of-the-art \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xing:2021:EEI, author = "Lu Xing and Eric Lee and Tong An and Bo-Cheng Chu and Ahmed Mahmood and Ahmed M. Aly and Jianguo Wang and Walid G. Aref", title = "An experimental evaluation and investigation of waves of misery in {R}-trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "478--490", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494132", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494132", abstract = "Waves of misery is a phenomenon where spikes of many node splits occur over short periods of time in tree indexes. Waves of misery negatively affect the performance of tree indexes in insertion-heavy workloads.
Waves of misery have been first observed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:PPR, author = "Yongyi Liu and Ahmed R. Mahmood and Amr Magdy and Sergio Rey", title = "{PRUC}: {P-regions} with user-defined constraint", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "491--503", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494133", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494133", abstract = "This paper introduces a generalized spatial regionalization problem, namely, PRUC (P-Regions with User-defined Constraint) that partitions spatial areas into homogeneous regions. PRUC accounts for user-defined constraints imposed over aggregate region \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2021:PIR, author = "Yile Chen and Xiucheng Li and Gao Cong and Cheng Long and Zhifeng Bao and Shang Liu and Wanli Gu and Fuzheng Zhang", title = "Points-of-interest relationship inference with spatial-enriched graph neural networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "504--512", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494134", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494134", abstract = "As a fundamental component in location-based services, inferring the relationship between points-of-interests (POIs) is very critical for service providers to offer good user experience to business owners and customers. Most of the existing methods for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2021:SSA, author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U
and Byron Choi and Jianliang Xu", title = "{SAFE}: a share-and-aggregate bandwidth exploration framework for kernel density visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "513--526", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494135", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494135", abstract = "Kernel density visualization (KDV) has been the de facto method in many spatial analysis tasks, including ecological modeling, crime hotspot detection, traffic accident hotspot detection, and disease outbreak detection. In these tasks, domain experts \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dittrich:2021:NYD, author = "Jens Dittrich and Joris Nix and Christian Sch{\"o}n", title = "The next 50 years in database indexing or: the case for automatically generated index structures", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "527--540", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494136", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494136", abstract = "Index structures are a building block of query processing and computer science in general. Since the dawn of computer technology there have been index structures. And since then, a myriad of index structures are being invented and published each and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chapnik:2021:DDA, author = "Koral Chapnik and Ilya Kolchinsky and Assaf Schuster", title = "{DARLING}: data-aware load shedding in complex event processing systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "541--554", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494137", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494137", abstract = "Complex event processing (CEP) is widely employed to detect user-defined combinations, or patterns, of events in massive streams of incoming data. Numerous applications such as healthcare, fraud detection, and more, use CEP technologies to capture \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhuo:2021:RMO, author = "Danyang Zhuo and Kaiyuan Zhang and Zhuohan Li and Siyuan Zhuang and Stephanie Wang and Ang Chen and Ion Stoica", title = "Rearchitecting in-memory object stores for low latency", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "555--568", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494138", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494138", abstract = "Low latency is increasingly critical for modern workloads, to the extent that compute functions are explicitly scheduled to be co-located with their in-memory object stores for faster access. 
However, the traditional object store architecture mandates \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2021:MTE, author = "Pingchuan Ma and Shuai Wang", title = "{MT-teql}: evaluating and augmenting neural {NLIDB} on real-world linguistic and schema variations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "569--582", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494139", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494139", abstract = "Natural Language Interface to Database (NLIDB) translates human utterances into SQL queries and enables database interactions for non-expert users. Recently, neural network models have become a major approach to implementing NLIDB. However, neural NLIDB \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shi:2021:TPE, author = "Jessica Shi and Laxman Dhulipala and Julian Shun", title = "Theoretically and practically efficient parallel nucleus decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "583--596", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494140", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494140", abstract = "This paper studies the nucleus decomposition problem, which has been shown to be useful in finding dense substructures in graphs. We present a novel parallel algorithm that is efficient both in theory and in practice. 
Our algorithm achieves a work \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2021:AHP, author = "Baotong Lu and Jialin Ding and Eric Lo and Umar Farooq Minhas and Tianzheng Wang", title = "{APEX}: a high-performance learned index on persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "597--610", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494141", abstract = "The recently released persistent memory (PM) offers high performance, persistence, and is cheaper than DRAM. This opens up new possibilities for indexes that operate and persist data directly on the memory bus. Recent learned indexes exploit data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Campos:2021:UTS, author = "David Campos and Tung Kieu and Chenjuan Guo and Feiteng Huang and Kai Zheng and Bin Yang and Christian S. 
Jensen", title = "Unsupervised time series outlier detection with diversity-driven convolutional ensembles", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "611--623", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494142", abstract = "With the sweeping digitalization of societal, medical, industrial, and scientific processes, sensing technologies are being deployed that produce increasing volumes of time series data, thus fueling a plethora of new or improved applications. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2021:EED, author = "Xiaoye Miao and Yangyang Wu and Lu Chen and Yunjun Gao and Jun Wang and Jianwei Yin", title = "Efficient and effective data imputation with influence functions", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "624--632", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494143", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494143", abstract = "Data imputation has been extensively explored to solve the missing data problem. The dramatically rising volume of missing data makes the training of imputation models computationally infeasible in real-life scenarios. In this paper, we propose an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kochsiek:2021:PTK, author = "Adrian Kochsiek and Rainer Gemulla", title = "Parallel training of knowledge graph embedding models: a comparison of techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "633--645", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494144", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494144", abstract = "Knowledge graph embedding (KGE) models represent the entities and relations of a knowledge graph (KG) using dense continuous representations called embeddings. KGE methods have recently gained traction for tasks such as knowledge graph completion and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vitagliano:2021:DLT, author = "Gerardo Vitagliano and Lan Jiang and Felix Naumann", title = "Detecting layout templates in complex multiregion files", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "646--658", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494145", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494145", abstract = "Spreadsheets are among the most commonly used file formats for data management, distribution, and analysis. Their widespread employment makes it easy to gather large collections of data, but their flexible canvas-based structure makes automated analysis \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maliszewski:2021:WPJ, author = "Kajetan Maliszewski and Jorge-Arnulfo Quian{\'e}-Ruiz and Jonas Traub and Volker Markl", title = "What is the price for joining securely?: benchmarking equi-joins in trusted execution environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "659--672", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494146", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494146", abstract = "Protection of personal data has been raised to be among the top requirements of modern systems. At the same time, it is now frequent that the owner of the data and the owner of the computing infrastructure are two entities with limited trust between \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ho:2021:ETP, author = "Van Long Ho and Nguyen Ho and Torben Bach Pedersen", title = "Efficient temporal pattern mining in big time series using mutual information", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "673--685", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494147", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494147", abstract = "Very large time series are increasingly available from an ever wider range of IoT-enabled sensors deployed in different environments. Significant insights can be gained by mining temporal patterns from these time series. 
Unlike traditional pattern \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:ELC, author = "Junhua Zhang and Long Yuan and Wentao Li and Lu Qin and Ying Zhang", title = "Efficient label-constrained shortest path queries on road networks: a tree decomposition approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "686--698", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494148", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494148", abstract = "Computing the shortest path between two vertices is a fundamental problem in road networks. Most of the existing works assume that the edges in the road networks have no labels, but in many real applications, the edges have labels and label constraints \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Suri:2021:ENC, author = "Sahaana Suri and Ihab F. 
Ilyas and Christopher R{\'e} and Theodoros Rekatsinas", title = "{Ember}: no-code context enrichment via similarity-based keyless joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "699--712", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494149", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494149", abstract = "Structured data, or data that adheres to a pre-defined schema, can suffer from fragmented context: information describing a single entity can be scattered across multiple datasets or tables tailored for specific business needs, with no explicit linking \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vu:2021:IPE, author = "Tin Vu and Ahmed Eldawy and Vagelis Hristidis and Vassilis Tsotras", title = "Incremental partitioning for efficient spatial data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "713--726", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494150", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494150", abstract = "Big spatial data has become ubiquitous, from mobile applications to satellite data. In most of these applications, data is continuously growing to huge volumes. Existing systems for big spatial data organize records at either the record-level or block-level \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2021:LAV, author = "Doris Jung-Lin Lee and Dixin Tang and Kunal Agarwal and Thyne Boonmark and Caitlyn Chen and Jake Kang and Ujjaini Mukhopadhyay and Jerry Song and Micah Yong and Marti A. Hearst and Aditya G. Parameswaran", title = "{Lux}: always-on visualization recommendations for exploratory dataframe workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "727--738", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494151", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494151", abstract = "Exploratory data science largely happens in computational notebooks with dataframe APIs, such as pandas, that support flexible means to transform, clean, and analyze data. Yet, visually exploring data in dataframes remains tedious, requiring substantial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Petersohn:2021:FRB, author = "Devin Petersohn and Dixin Tang and Rehan Durrani and Areg Melik-Adamyan and Joseph E. Gonzalez and Anthony D. Joseph and Aditya G. 
Parameswaran", title = "Flexible rule-based decomposition and metadata independence in {Modin}: a parallel dataframe system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "3", pages = "739--751", month = nov, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3494124.3494152", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Feb 5 06:35:56 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3494124.3494152", abstract = "Dataframes have become universally popular as a means to represent data in various stages of structure, and manipulate it using a rich set of operators---thereby becoming an essential tool in the data scientists' toolbox. However, dataframe systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2021:CED, author = "Yuxing Han and Ziniu Wu and Peizhi Wu and Rong Zhu and Jingyi Yang and Liang Wei Tan and Kai Zeng and Gao Cong and Yanzhao Qin and Andreas Pfadler and Zhengping Qian and Jingren Zhou and Jiangneng Li and Bin Cui", title = "Cardinality estimation in {DBMS}: a comprehensive benchmark evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "752--765", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503586", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503586", abstract = "Cardinality estimation (CardEst) plays a significant role in generating high-quality query plans for a query optimizer in DBMS. In the last decade, an increasing number of advanced CardEst methods (especially ML-based) have been proposed with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2021:RRD, author = "Qizhen Zhang and Philip A. Bernstein and Daniel S. Berger and Badrish Chandramouli", title = "{Redy}: remote dynamic memory cache", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "766--779", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503587", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503587", abstract = "Redy is a cloud service that provides high performance caches using RDMA-accessible remote memory. An application can customize the performance of each cache with a service level objective (SLO) for latency and throughput. By using remote memory, it can \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Boissier:2021:RBC, author = "Martin Boissier", title = "Robust and budget-constrained encoding configurations for in-memory database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "780--793", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503588", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503588", abstract = "Data encoding has been applied to database systems for decades as it mitigates bandwidth bottlenecks and reduces storage requirements. But even in the presence of these advantages, most in-memory database systems use data encoding only conservatively as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tan:2021:FNR, author = "Shulong Tan and Weijie Zhao and Ping Li", title = "Fast neural ranking on bipartite graph indices", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "794--803", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503589", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503589", abstract = "Neural network based ranking has been widely adopted owing to its powerful capacity in modeling complex relationships (e.g., users and items, questions and answers). Online neural network ranking, i.e., the so called fast neural ranking, is considered a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gan:2021:BSD, author = "Shaoduo Gan and Jiawei Jiang and Binhang Yuan and Ce Zhang and Xiangru Lian and Rui Wang and Jianbin Chang and Chengjun Liu and Hongmei Shi and Shengzhuo Zhang and Xianghong Li and Tengxu Sun and Sen Yang and Ji Liu", title = "{Bagua}: scaling up distributed learning with system relaxations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "804--813", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503590", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503590", abstract = "Recent years have witnessed a growing list of systems for distributed data-parallel training. Existing systems largely fit into two paradigms, i.e., parameter server and MPI-style collective operations. 
On the algorithmic side, researchers have proposed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2021:SCO, author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U and Byron Choi and Jianliang Xu", title = "{SWS}: a complexity-optimized solution for spatial-temporal kernel density visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "814--827", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503591", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503591", abstract = "Spatial-temporal kernel density visualization (STKDV) has been extensively used in a wide range of applications, e.g., disease outbreak analysis, traffic accident hotspot detection, and crime hotspot detection. While STKDV can provide accurate and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2021:PFA, author = "Junxu Liu and Jian Lou and Li Xiong and Jinfei Liu and Xiaofeng Meng", title = "Projected federated averaging with heterogeneous differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "828--840", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503592", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503592", abstract = "Federated Learning (FL) is a promising framework for multiple clients to learn a joint model without directly sharing the data. 
In addition to high utility of the joint model, rigorous privacy protection of the data and communication efficiency are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Haimovich:2021:PPS, author = "Daniel Haimovich and Dima Karamshuk and Thomas J. Leeper and Evgeniy Riabenko and Milan Vojnovic", title = "Popularity prediction for social media over arbitrary time horizons", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "841--849", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503593", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503593", abstract = "Predicting the popularity of social media content in real time requires approaches that efficiently operate at global scale. Popularity prediction is important for many applications, including detection of harmful viral content to enable timely content \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Doshi:2021:LWS, author = "Ishita Doshi and Dhritiman Das and Ashish Bhutani and Rajeev Kumar and Rushi Bhatt and Niranjan Balasubramanian", title = "{LANNS}: a web-scale approximate nearest neighbor lookup system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "850--858", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503594", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503594", abstract = "Nearest neighbor search (NNS) has a wide range of applications in information retrieval, computer vision, machine learning, databases, and other areas. Existing state-of-the-art algorithm for nearest neighbor search, Hierarchical Navigable Small World \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pena:2021:FDD, author = "Eduardo H. M. Pena and Eduardo C. de Almeida and Felix Naumann", title = "Fast detection of denial constraint violations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "859--871", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503595", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503595", abstract = "The detection of constraint-based errors is a critical task in many data cleaning solutions. Previous works perform the task either using traditional data management systems or using specialized systems that speed up error detection. 
Unfortunately, both \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2021:CFF, author = "Bowen Yu and Guanyu Feng and Huanqi Cao and Xiaohan Li and Zhenbo Sun and Haojie Wang and Xiaowei Zhu and Weimin Zheng and Wenguang Chen", title = "{Chukonu}: a fully-featured high-performance big data framework that integrates a native compute engine into {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "872--885", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503596", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503596", abstract = "Apache Spark is a widely deployed big data analytics framework that offers such attractive features as resiliency, load-balancing, and a rich ecosystem. However, there is still plenty of room for improvement in its performance. Although a data-parallel \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jin:2021:CNM, author = "Sian Jin and Chengming Zhang and Xintong Jiang and Yunhe Feng and Hui Guan and Guanpeng Li and Shuaiwen Leon Song and Dingwen Tao", title = "{COMET}: a novel memory-efficient deep learning training framework by using error-bounded lossy compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "886--899", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503597", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503597", abstract = "Deep neural networks (DNNs) are becoming increasingly deeper, wider, and non-linear due to the growing demands on prediction accuracy and analysis quality. Training wide and deep neural networks require large amounts of storage resources such as memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2021:FMF, author = "Zitao Li and Bolin Ding and Ce Zhang and Ninghui Li and Jingren Zhou", title = "Federated matrix factorization with privacy guarantee", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "900--913", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503598", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503598", abstract = "Matrix factorization (MF) approximates unobserved ratings in a rating matrix, whose rows correspond to users and columns correspond to items to be rated, and has been serving as a fundamental building block in recommendation systems. This paper \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Duong:2021:SRG, author = "Chi Thang Duong and Trung Dung Hoang and Hongzhi Yin and Matthias Weidlich and Quoc Viet Hung Nguyen and Karl Aberer", title = "Scalable robust graph embedding with {Spark}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "914--922", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503599", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503599", abstract = "Graph embedding aims at learning a vector-based representation of vertices that incorporates the structure of the graph. This representation then enables inference of graph properties. Existing graph embedding techniques, however, do not scale well to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paul:2021:DWC, author = "Debjyoti Paul and Jie Cao and Feifei Li and Vivek Srikumar", title = "Database workload characterization with query plan encoders", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "923--935", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503600", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503600", abstract = "Smart databases are adopting artificial intelligence (AI) technologies to achieve instance optimality, and in the future, databases will come with prepackaged AI models within their core components. The reason is that every database runs on different \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Modi:2021:NQO, author = "Abhishek Modi and Kaushik Rajan and Srinivas Thimmaiah and Prakhar Jain and Swinky Mann and Ayushi Agarwal and Ajith Shetty and Shahid K. I. and Ashit Gosalia and Partho Sarthi", title = "New query optimization techniques in the {Spark} engine of {Azure Synapse}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "936--948", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503601", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503601", abstract = "The cost of big-data query execution is dominated by stateful operators. 
These include sort and hash-aggregate that typically materialize intermediate data in memory, and exchange that materializes data to disk and transfers data over the network. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sinthong:2021:DDQ, author = "Phanwadee Sinthong and Dhaval Patel and Nianjun Zhou and Shrey Shrivastava and Arun Iyengar and Anuradha Bhamidipaty", title = "{DQDF}: data-quality-aware dataframes", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "949--957", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503602", abstract = "Data quality assessment is an essential process of any data analysis process including machine learning. The process is time-consuming as it involves multiple independent data quality checks that are performed iteratively at scale on evolving data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agarwal:2021:RGC, author = "Archita Agarwal and Marilyn George and Aaron Jeyaraj and Malte Schwarzkopf", title = "Retrofitting {GDPR} compliance onto legacy databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "958--970", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503603", abstract = "New privacy laws like the European Union's General Data Protection Regulation (GDPR) require database administrators (DBAs) to identify all information related to an individual on request, e.g., to return or delete it. This requires time-consuming \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2021:AAC, author = "Xinle Wu and Dalin Zhang and Chenjuan Guo and Chaoyang He and Bin Yang and Christian S. Jensen", title = "{AutoCTS}: automated correlated time series forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "971--983", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503604", abstract = "Correlated time series (CTS) forecasting plays an essential role in many cyber-physical systems, where multiple sensors emit time series that capture interconnected processes. Solutions based on deep learning that deliver state-of-the-art CTS \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sudhir:2021:RLM, author = "Sivaprasad Sudhir and Michael Cafarella and Samuel Madden", title = "Replicated layout for in-memory database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "4", pages = "984--997", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3503585.3503606", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Apr 15 06:48:40 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3503585.3503606", abstract = "Scanning and filtering are the foundations of analytical database systems. Modern DBMSs employ a variety of techniques to partition and layout data to improve the performance of these operations. To accelerate query performance, systems tune data layout \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sanghi:2022:PCD, author = "Anupam Sanghi and Shadab Ahmed and Jayant R. Haritsa", title = "Projection-compliant database generation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "998--1010", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510398", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510398", abstract = "Synthesizing data using declarative formalisms has been persuasively advocated in contemporary data generation frameworks. In particular, they specify operator output volumes through row-cardinality constraints. However, thus far, adherence to these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jin:2022:MRE, author = "Guodong Jin and Semih Salihoglu", title = "Making {RDBMSs} efficient on graph workloads through predefined joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1011--1023", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510400", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510400", abstract = "Joins in native graph database management systems (GDBMSs) are predefined to the system as edges, which are indexed in adjacency list indices and serve as pointers. This contrasts with and can be more performant than value-based joins in RDBMSs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deep:2022:REJ, author = "Shaleen Deep and Xiao Hu and Paraschos Koutris", title = "Ranked enumeration of join queries with projections", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1024--1037", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510401", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510401", abstract = "Join query evaluation with ordering is a fundamental data processing task in relational database management systems. SQL and custom graph query languages such as Cypher offer this functionality by allowing users to specify the order via the ORDER BY \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shin:2022:HSC, author = "Ahnjae Shin and Joo Seong Jeong and Do Yoon Kim and Soyoung Jung and Byung-Gon Chun", title = "{Hippo}: sharing computations in hyper-parameter optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1038--1052", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510402", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510402", abstract = "Hyper-parameter optimization is crucial for pushing the accuracy of a deep learning model to its limits. However, a hyper-parameter optimization job, referred to as a study, involves numerous trials of training a model using different training knobs, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rinberg:2022:DJC, author = "Arik Rinberg and Tomer Solomon and Roee Shlomo and Guy Khazma and Gal Lushi and Idit Keidar and Paula Ta-Shma", title = "{DSON}: {JSON CRDT} using delta-mutations for document stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1053--1065", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510403", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510403", abstract = "We propose DSON, a space efficient $ \delta $-based CRDT approach for distributed JSON document stores, enabling high availability at a global scale, while providing strong eventual consistency guarantees. 
We define the semantics of our CRDT based approach \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeighami:2022:NDD, author = "Sepanta Zeighami and Ritesh Ahuja and Gabriel Ghinita and Cyrus Shahabi", title = "A neural database for differentially private spatial range queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1066--1078", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510404", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510404", abstract = "Mobile apps and location-based services generate large amounts of location data. Location density information from such datasets benefits research on traffic optimization, context-aware notifications and public health (e.g., disease spread). To preserve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maltry:2022:CAR, author = "Marcel Maltry and Jens Dittrich", title = "A critical analysis of recursive model indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1079--1091", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510405", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510405", abstract = "The recursive model index (RMI) has recently been introduced as a machine-learned replacement for traditional indexes over sorted data, achieving remarkably fast lookups. 
Follow-up work focused on explaining RMI's performance and automatically \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2022:HBD, author = "Zerui Ge and Dumitrel Loghin and Beng Chin Ooi and Pingcheng Ruan and Tianwen Wang", title = "Hybrid blockchain database systems: design and performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1092--1104", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510406", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510406", abstract = "With the emergence of hybrid blockchain database systems, we aim to provide an in-depth analysis of the performance and trade-offs among a few representative systems. To achieve this goal, we implement Veritas and BlockchainDB from scratch. For Veritas, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bonifati:2022:TQT, author = "Angela Bonifati and Stefania Dumbrava and George Fletcher and Jan Hidders and Matthias Hofer and Wim Martens and Filip Murlak and Joshua Shinavier and S{\l}awek Staworko and Dominik Tomaszuk", title = "Threshold queries in theory and in the wild", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1105--1118", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510407", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510407", abstract = "Threshold queries are an important class of queries that only require computing or counting answers up to a specified threshold value. To the best of our knowledge, threshold queries have been largely disregarded in the research literature, which is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sichert:2022:UDO, author = "Moritz Sichert and Thomas Neumann", title = "User-defined operators: efficiently integrating custom algorithms into modern databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "5", pages = "1119--1131", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3510397.3510408", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed May 25 08:14:25 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3510397.3510408", abstract = "In recent years, complex data mining and machine learning algorithms have become more common in data analytics. 
Several specialized systems exist to evaluate these algorithms on ever-growing data sets, which are built to efficiently execute different \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:PEP, author = "Yue Wang and Vivek Narasayya and Yeye He and Surajit Chaudhuri", title = "{PACk}: an efficient partition-based distributed agglomerative hierarchical clustering algorithm for deduplication", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1132--1145", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514062", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514062", abstract = "The Agglomerative Hierarchical Clustering (AHC) algorithm is widely used in real-world applications. As data volumes continue to grow, efficient scale-out techniques for AHC are becoming increasingly important. In this paper, we propose a Partition-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2022:NOA, author = "Lijun Chang and Zhiyi Wang", title = "A near-optimal approach to edge connectivity-based hierarchical graph decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1146--1158", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514063", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514063", abstract = "Driven by applications in graph analytics, the problem of efficiently computing all $ k $-edge connected components ($ k $-ECCs) of a graph $ G $ for a user-given $ k $ has been extensively and well studied. It is known that the $ k $-ECCs of $ G $ for all possible values of $ k $ \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tong:2022:HFE, author = "Yongxin Tong and Xuchen Pan and Yuxiang Zeng and Yexuan Shi and Chunbo Xue and Zimu Zhou and Xiaofei Zhang and Lei Chen and Yi Xu and Ke Xu and Weifeng Lv", title = "{Hu-Fu}: efficient and secure spatial queries over data federation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1159--1172", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514064", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514064", abstract = "Data isolation has become an obstacle to scale up query processing over big data, since sharing raw data among data owners is often prohibitive due to security concerns. 
A promising solution is to perform secure queries over a federation of multiple \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fuchs:2022:SUT, author = "Per Fuchs and Domagoj Margan and Jana Giceva", title = "{Sortledton}: a universal, transactional graph data structure", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1173--1186", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514065", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514065", abstract = "Despite the wide adoption of graph processing across many different application domains, there is no underlying data structure that can serve a variety of graph workloads (analytics, traversals, and pattern matching) on dynamic graphs with transactional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:NLF, author = "Bowen Zhang and Shengan Zheng and Zhenlin Qi and Linpeng Huang", title = "{NBTree}: a lock-free {PM}-friendly persistent {B+}-tree for {eADR}-enabled {PM} systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1187--1200", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514066", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514066", abstract = "Persistent memory (PM) promises near-DRAM performance as well as data persistency. 
Recently, a new feature called eADR is available on the 2$^{nd}$ generation Intel Optane PM with the 3$^{rd}$ generation Intel Xeon Scalable Processors. eADR ensures that data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tuli:2022:TDT, author = "Shreshth Tuli and Giuliano Casale and Nicholas R. Jennings", title = "{TranAD}: deep transformer networks for anomaly detection in multivariate time series data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1201--1214", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514067", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514067", abstract = "Efficient anomaly detection and diagnosis in multivariate time-series data is of great importance for modern industrial applications. However, building a system that is able to quickly and accurately pinpoint anomalous observations is a challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2022:SPO, author = "Fuheng Zhao and Divyakant Agrawal and Amr {El Abbadi} and Ahmed Metwally", title = "{SpaceSaving$ \pm $}: an optimal algorithm for frequency estimation and frequent items in the bounded-deletion model", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1215--1227", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514068", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See errata \cite{Zhao:2023:ESO}.", URL = "https://dl.acm.org/doi/10.14778/3514061.3514068", abstract = "In this paper, we propose the first deterministic algorithms to solve the frequency estimation and frequent item problems in the bounded-deletion model. We establish the space lower bound for solving the deterministic frequent items problem in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2022:BEG, author = "Chenguang Zheng and Hongzhi Chen and Yuxuan Cheng and Zhezheng Song and Yifan Wu and Changji Li and James Cheng and Hao Yang and Shuai Zhang", title = "{ByteGNN}: efficient graph neural network training at large scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1228--1242", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514069", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514069", abstract = "Graph neural networks (GNNs) have shown excellent performance in a wide range of applications such as recommendation, risk control, and drug discovery. With the increase in the volume of graph data, distributed GNN systems become essential to support \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2022:QDG, author = "Yuli Jiang and Yu Rong and Hong Cheng and Xin Huang and Kangfei Zhao and Junzhou Huang", title = "Query driven-graph neural networks for community search: from non-attributed, attributed, to interactive attributed", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1243--1255", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514070", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514070", abstract = "Given one or more query vertices, Community Search (CS) aims to find densely intra-connected and loosely inter-connected structures containing query vertices. 
Attributed Community Search (ACS), a related problem, is more challenging since it finds \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:HTT, author = "Yang Li and Yu Shen and Huaijun Jiang and Wentao Zhang and Jixiang Li and Ji Liu and Ce Zhang and Bin Cui", title = "{Hyper-tune}: towards efficient hyper-parameter tuning at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1256--1265", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514071", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514071", abstract = "The ever-growing demand and complexity of machine learning are putting pressure on hyper-parameter tuning systems: while the evaluation cost of models continues to increase, the scalability of state-of-the-arts starts to become a crucial bottleneck. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Minartz:2022:MCD, author = "Koen Minartz and Jens E. d'Hondt and Odysseas Papapetrou", title = "Multivariate correlations discovery in static and streaming data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1266--1278", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514072", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514072", abstract = "Correlation analysis is an invaluable tool in many domains, for better understanding data and extracting salient insights. 
Most works to date focus on detecting high pairwise correlations. A generalization of this problem with known applications but no \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Poppe:2022:MPA, author = "Olga Poppe and Qun Guo and Willis Lang and Pankaj Arora and Morgan Oslake and Shize Xu and Ajay Kalhan", title = "{Moneyball}: proactive auto-scaling in {Microsoft Azure SQL} database serverless", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1279--1287", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514073", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514073", abstract = "Microsoft Azure SQL Database is among the leading relational database service providers in the cloud. Serverless compute automatically scales resources based on workload demand. When a database becomes idle its resources are reclaimed. When activity \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2022:PRP, author = "Kewei Cheng and Xian Li and Yifan Ethan Xu and Xin Luna Dong and Yizhou Sun", title = "{PGE}: robust product graph embedding learning for error detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1288--1296", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514074", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514074", abstract = "Although product graphs (PGs) have gained increasing attentions in recent years for their successful applications in product search and recommendations, the extensive power of PGs can be limited by the inevitable involvement of various kinds of errors. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Manne:2022:CMR, author = "Naga Nithin Manne and Shilvi Satpati and Tanu Malik and Amitabha Bagchi and Ashish Gehani and Amitabh Chaudhary", title = "{CHEX}: multiversion replay with ordered checkpoints", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "6", pages = "1297--1310", month = feb, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3514061.3514075", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3514061.3514075", abstract = "In scientific computing and data science disciplines, it is often necessary to share application workflows and repeat results. Current tools containerize application workflows, and share the resulting container for repeating results. 
These tools, due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Even:2022:PFP, author = "Tomer Even and Guy Even and Adam Morrison", title = "Prefix filter: practically and theoretically better than {Bloom}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1311--1323", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523211", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523211", abstract = "Many applications of approximate membership query data structures, or filters, require only an incremental filter that supports insertions but not deletions. However, the design space of incremental filters is missing a ``sweet spot'' filter that combines \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yamada:2022:SDS, author = "Hiroyuki Yamada and Jun Nemoto", title = "{Scalar DL}: scalable and practical {Byzantine} fault detection for transactional database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1324--1336", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523212", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523212", abstract = "This paper presents Scalar DL, a Byzantine fault detection (BFD) middleware for transactional database systems. 
Scalar DL manages two separately administered database replicas in a database system and can detect Byzantine faults in the database system \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2022:NLR, author = "Gyuyeong Kim and Wonjun Lee", title = "In-network leaderless replication for distributed data stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1337--1349", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523213", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523213", abstract = "Leaderless replication allows any replica to handle any type of request to achieve read scalability and high availability for distributed data stores. However, this entails burdensome coordination overhead of replication protocols, degrading write \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2022:FAC, author = "Xin Sun and Xin Huang and Di Jin", title = "Fast algorithms for core maximization on large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1350--1362", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523214", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523214", abstract = "Core maximization, that enlarges the $k$-core as much as possible by inserting a few new edges into a graph, is particularly useful for social group engagement and network stability improvement. 
However, the core maximization problem has been \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pan:2022:NSC, author = "Shuye Pan and Peng Wang and Chen Wang and Wei Wang and Jianmin Wang", title = "{NLC}: search correlated window pairs on long time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1363--1375", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523215", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523215", abstract = "Nowadays, many applications, like Internet of Things and Industrial Internet, collect data points from sensors continuously to form long time series. Finding correlation between time series is a fundamental task for many time series mining problems. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:EBL, author = "Hanzhi Wang and Zhewei Wei and Junhao Gan and Ye Yuan and Xiaoyong Du and Ji-Rong Wen", title = "Edge-based local push for personalized {PageRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1376--1389", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523216", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523216", abstract = "Personalized PageRank (PPR) is a popular node proximity metric in graph mining and network research. 
A single-source PPR (SSPPR) query asks for the PPR value of each node on the graph. Due to its importance and wide applications, decades of efforts have \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2022:CSD, author = "Harry Kai-Ho Chan and Huan Li and Xiao Li and Hua Lu", title = "Continuous social distance monitoring in indoor space", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1390--1402", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523217", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523217", abstract = "The COVID-19 pandemic has caused over 6 million deaths since 2020. To contain the spread of the virus, social distancing is one of the most simple yet effective approaches. Motivated by this, in this paper we study the problem of continuous social \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2022:DSC, author = "Xibo Sun and Shixuan Sun and Qiong Luo and Bingsheng He", title = "An in-depth study of continuous subgraph matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1403--1416", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523218", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523218", abstract = "Continuous subgraph matching (CSM) algorithms find the occurrences of a given pattern on a stream of data graphs online. 
A number of incremental CSM algorithms have been proposed. However, a systematical study on these algorithms is missing to identify \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mishra:2022:OST, author = "Abhinav Mishra and Ram Sriharsha and Sichen Zhong", title = "{OnlineSTL}: scaling time series decomposition by $ 100 \times $", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1417--1425", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523219", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523219", abstract = "Decomposing a complex time series into trend, seasonality, and remainder components is an important primitive that facilitates time series anomaly detection, change point detection, and forecasting. Although numerous batch algorithms are known for time \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:SSS, author = "Haoyu Li and Qizhi Chen and Yixin Zhang and Tong Yang and Bin Cui", title = "{Stingy sketch}: a sketch framework for accurate and fast frequency estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1426--1438", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523220", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523220", abstract = "Recording the frequency of items in highly skewed data streams is a fundamental and hot problem in recent years. 
The literature demonstrates that sketch is the most promising solution. The typical metrics to measure a sketch are accuracy and speed, but \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:SDP, author = "Yang Wang and Miao Yu and Yujie Hui and Fang Zhou and Yuyang Huang and Rui Zhu and Xueyuan Ren and Tianxi Li and Xiaoyi Lu", title = "A study of database performance sensitivity to experiment settings", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1439--1452", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523221", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523221", abstract = "To allow performance comparison across different systems, our community has developed multiple benchmarks, such as TPC-C and YCSB, which are widely used. However, despite such effort, interpreting and comparing performance numbers is still a challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chao:2022:ITC, author = "Zemin Chao and Hong Gao and Yinan An and Jianzhong Li", title = "The inherent time complexity and an efficient algorithm for subsequence matching problem", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1453--1465", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523222", abstract = "Subsequence matching is an important and fundamental problem on time series data. This paper studies the inherent time complexity of the subsequence matching problem and designs a more efficient algorithm for solving the problem. Firstly, it is proved \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chai:2022:SDA, author = "Chengliang Chai and Jiabin Liu and Nan Tang and Guoliang Li and Yuyu Luo", title = "Selective data acquisition in the wild for model charging", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1466--1478", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523223", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523223", abstract = "The lack of sufficient labeled data is a key bottleneck for practitioners in many real-world supervised machine learning (ML) tasks. In this paper, we study a new problem, namely selective data acquisition in the wild for model charging: given a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2022:DAR, author = "Wenfei Fan and Wenzhi Fu and Ruochun Jin and Ping Lu and Chao Tian", title = "Discovering association rules from big graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1479--1492", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523224", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523224", abstract = "This paper tackles two challenges to discovery of graph rules. Existing discovery methods often (a) return an excessive number of rules, and (b) do not scale with large graphs given the intractability of the discovery problem. We propose an application-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2022:DEE, author = "Xiaolin Han and Reynold Cheng and Chenhao Ma and Tobias Grubenmann", title = "{DeepTEA}: effective and efficient online time-dependent trajectory outlier detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1493--1505", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523225", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523225", abstract = "In this paper, we study anomalous trajectory detection, which aims to extract abnormal movements of vehicles on the roads. This important problem, which facilitates understanding of traffic behavior and detection of taxi fraud, is challenging due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Simonini:2022:ERD, author = "Giovanni Simonini and Luca Zecchini and Sonia Bergamaschi and Felix Naumann", title = "Entity resolution on-demand", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "7", pages = "1506--1518", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3523210.3523226", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:18 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3523210.3523226", abstract = "Entity Resolution (ER) aims to identify and merge records that refer to the same real-world entity. ER is typically employed as an expensive cleaning step on the entire data before consuming it. Yet, determining which entities are useful once cleaned \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alhazmi:2022:FBC, author = "Afnan Alhazmi and Tom Blount and George Konstantinidis", title = "{ForBackBench}: a benchmark for chasing vs. query-rewriting", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1519--1532", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529338", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529338", abstract = "The problems of Data Integration/Exchange (DE) and Ontology Based Data Access (OBDA) have been extensively studied across different communities. The underlying problem is common: using a number of differently structured data-sources mapped to a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:ASB, author = "Jeremy Chen and Yuqing Huang and Mushi Wang and Semih Salihoglu and Ken Salem", title = "Accurate summary-based cardinality estimation through the lens of cardinality estimation graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1533--1545", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529339", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529339", abstract = "This paper is an experimental and analytical study of two classes of summary-based cardinality estimators that use statistics about input relations and small-size joins in the context of graph database management systems: (i) optimistic estimators that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liao:2022:DDC, author = "Xuankun Liao and Qing Liu and Jiaxin Jiang and Xin Huang and Jianliang Xu and Byron Choi", title = "Distributed {D-core} decomposition over large directed graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1546--1558", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529340", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529340", abstract = "Given a directed graph $G$ and integers $k$ and $l$, a D-core is the maximal subgraph $ H \subseteq G$ such that for every vertex of $H$, its in-degree and out-degree are no smaller than $k$ and $l$, respectively. 
For a directed graph $G$, the problem of D-core decomposition \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:EMB, author = "Lu Chen and Chengfei Liu and Rui Zhou and Jiajie Xu and Jianxin Li", title = "Efficient maximal biclique enumeration for large sparse bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1559--1571", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529341", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529341", abstract = "Maximal bicliques are effective to reveal meaningful information hidden in bipartite graphs. Maximal biclique enumeration (MBE) is challenging since the number of the maximal bicliques grows exponentially w.r.t. the number of vertices in a bipartite \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2022:TGF, author = "Hongkuan Zhou and Da Zheng and Israt Nisa and Vasileios Ioannidis and Xiang Song and George Karypis", title = "{TGL}: a general framework for temporal {GNN} training on billion-scale graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1572--1580", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529342", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529342", abstract = "Many real world graphs contain time domain information. 
Temporal Graph Neural Networks capture temporal information as well as structural and contextual information in the generated dynamic node embeddings. Researchers have shown that these embeddings \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2022:DLF, author = "Binhang Yuan and Cameron R. Wolfe and Chen Dun and Yuxin Tang and Anastasios Kyrillidis and Chris Jermaine", title = "Distributed learning of fully connected neural networks using independent subnet training", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1581--1590", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529343", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529343", abstract = "Distributed machine learning (ML) can bring more computational resources to bear than single-machine learning, thus enabling reductions in training time. Distributed learning partitions models and data over many machines, allowing model and dataset \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Burckhardt:2022:NEE, author = "Sebastian Burckhardt and Badrish Chandramouli and Chris Gillum and David Justo and Konstantinos Kallas and Connor McMahon and Christopher S. 
Meiklejohn and Xiangfeng Zhu", title = "{Netherite}: efficient execution of serverless workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1591--1604", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529344", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529344", abstract = "Serverless is a popular choice for cloud service architects because it can provide scalability and load-based billing with minimal developer effort. Functions-as-a-service (FaaS) are originally stateless, but emerging frameworks add stateful \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huynh:2022:ERT, author = "Andy Huynh and Harshal A. Chaudhari and Evimaria Terzi and Manos Athanassoulis", title = "{Endure}: a robust tuning paradigm for {LSM} trees under workload uncertainty", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1605--1618", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529345", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529345", abstract = "Log-Structured Merge trees (LSM trees) are increasingly used as the storage engines behind several data systems, frequently deployed in the cloud. Similar to other database architectures, LSM trees consider information about the expected workload (e.g., \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:EDB, author = "Hongzheng Li and Yingxia Shao and Junping Du and Bin Cui and Lei Chen", title = "An {I/O}-efficient disk-based graph system for scalable second-order random walk of large graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1619--1631", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529346", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529346", abstract = "Random walk is widely used in many graph analysis tasks, especially the first-order random walk. However, as a simplification of real-world problems, the first-order random walk is poor at modeling higher-order structures in the data. Recently, second-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vaidya:2022:SLE, author = "Kapil Vaidya and Subarna Chatterjee and Eric Knorr and Michael Mitzenmacher and Stratos Idreos and Tim Kraska", title = "{SNARF}: a learning-enhanced range filter", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1632--1644", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529347", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529347", abstract = "We present Sparse Numerical Array-Based Range Filters (SNARF), a learned range filter that efficiently supports range queries for numerical data. 
SNARF creates a model of the data distribution to map the keys into a bit array which is stored in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:DEI, author = "Xin Chen and You Peng and Sibo Wang and Jeffrey Xu Yu", title = "{DLCR}: efficient indexing for label-constrained reachability queries on large dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1645--1657", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529348", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529348", abstract = "Many real-world graphs, e.g., social networks, biological networks, knowledge graphs, naturally come with edge-labels, with different labels representing different relationships between nodes. On such edge-labeled graphs, an important query is the label-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2022:QTT, author = "Yue Zhao and Gao Cong and Jiachen Shi and Chunyan Miao", title = "{QueryFormer}: a tree transformer model for query plan representation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1658--1670", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529349", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529349", abstract = "Machine learning has become a prominent method in many database optimization problems such as cost estimation, index selection and query optimization. Translating query execution plans into their vectorized representations is non-trivial. Recently, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2022:ICI, author = "Leon Lee and Siphrey Xie and Yunus Ma and Shimin Chen", title = "Index checkpoints for instant recovery in in-memory database systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1671--1683", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529350", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529350", abstract = "We observe that the time bottleneck during the recovery phase of an IMDB (In-Memory DataBase system) shifts from log replaying to index rebuilding after the state-of-art techniques for instant recovery have been applied. In this paper, we investigate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Esmailoghli:2022:MMA, author = "Mahdi Esmailoghli and Jorge-Arnulfo Quian{\'e}-Ruiz and Ziawasch Abedjan", title = "{MATE}: multi-attribute table extraction", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1684--1696", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529353", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529353", abstract = "A core operation in data discovery is to find joinable tables for a given table. Real-world tables include both unary and n-ary join keys. However, existing table discovery systems are optimized for unary joins and are ineffective and slow in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paparrizos:2022:TUE, author = "John Paparrizos and Yuhao Kang and Paul Boniol and Ruey S. Tsay and Themis Palpanas and Michael J. Franklin", title = "{TSB-UAD}: an end-to-end benchmark suite for univariate time-series anomaly detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1697--1711", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529354", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529354", abstract = "The detection of anomalies in time series has gained ample academic and industrial attention. However, no comprehensive benchmark exists to evaluate time-series anomaly detection methods. 
It is common to use (i) proprietary or synthetic data, often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Leone:2022:CRE, author = "Manuel Leone and Stefano Huber and Akhil Arora and Alberto Garc{\'\i}a-Dur{\'a}n and Robert West", title = "A critical re-evaluation of neural methods for entity alignment", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1712--1725", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529355", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529355", abstract = "Neural methods have become the de-facto choice for the vast majority of data analysis tasks, and entity alignment (EA) is no exception. Not surprisingly, more than 50 different neural EA methods have been published since 2017. However, surprisingly, an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paganelli:2022:AHB, author = "Matteo Paganelli and Francesco {Del Buono} and Andrea Baraldi and Francesco Guerra", title = "Analyzing how {BERT} performs entity matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "8", pages = "1726--1738", month = apr, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3529337.3529356", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 24 09:22:19 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3529337.3529356", abstract = "State-of-the-art Entity Matching (EM) approaches rely on transformer architectures, such as BERT, for generating highly contextualized embeddings of terms.
The embeddings are then used to predict whether pairs of entity descriptions refer to the same \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arun:2022:SBF, author = "Balaji Arun and Binoy Ravindran", title = "Scalable {Byzantine} fault tolerance via partial decentralization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1739--1752", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538599", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538599", abstract = "Byzantine consensus is a critical component in many permissioned Blockchains and distributed ledgers. We propose a new paradigm for designing BFT protocols called DQBFT that addresses three major performance and scalability challenges that plague past \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:EEB, author = "Huan Li and Lanjing Yi and Bo Tang and Hua Lu and Christian S. 
Jensen", title = "Efficient and error-bounded spatiotemporal quantile monitoring in edge computing environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1753--1765", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538600", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538600", abstract = "Underlying many types of data analytics, a spatiotemporal quantile monitoring (SQM) query continuously returns the quantiles of a dataset observed in a spatiotemporal range. In this paper, we study SQM in an Internet of Things (IoT) based edge computing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kato:2022:HDP, author = "Fumiyuki Kato and Tsubasa Takahashi and Shun Takagi and Yang Cao and Seng Pei Liew and Masatoshi Yoshikawa", title = "{HDPView}: differentially private materialized view for exploring high dimensional relational data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1766--1778", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538601", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538601", abstract = "How can we explore the unknown properties of high-dimensional sensitive relational data while preserving privacy? We study how to construct an explorable privacy-preserving materialized view under differential privacy. No existing state-of-the-art \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schmidl:2022:ADT, author = "Sebastian Schmidl and Phillip Wenig and Thorsten Papenbrock", title = "Anomaly detection in time series: a comprehensive evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1779--1797", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538602", abstract = "Detecting anomalous subsequences in time series data is an important task in areas ranging from manufacturing processes over finance applications to health care monitoring. An anomaly can indicate important events, such as production faults, delivery \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Youngmann:2022:GED, author = "Brit Youngmann and Sihem Amer-Yahia and Aur{\'e}lien Personnaz", title = "Guided exploration of data summaries", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1798--1807", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538603", abstract = "Data summarization is the process of producing interpretable and representative subsets of an input dataset. It is usually performed following a one-shot process with the purpose of finding the best summary. A useful summary contains k individually \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:FDT, author = "Xinyi Zhang and Zhuo Chang and Yang Li and Hong Wu and Jian Tan and Feifei Li and Bin Cui", title = "Facilitating database tuning with hyper-parameter optimization: a comprehensive experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1808--1821", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538604", abstract = "Recently, using automatic configuration tuning to improve the performance of modern database management systems (DBMSs) has attracted increasing interest from the database community. This is embodied with a number of systems featuring advanced tuning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:ESV, author = "Zuan Wang and Xiaofeng Ding and Hai Jin and Pan Zhou", title = "Efficient secure and verifiable location-based skyline queries over encrypted data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1822--1834", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538605", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538605", abstract = "Supporting secure location-based services on encrypted data that is outsourced to cloud computing platforms remains an ongoing challenge for efficiency due to expensive ciphertext calculation overhead. Furthermore, since the clouds may not be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2022:TIC, author = "Zhuoyue Zhao and Dong Xie and Feifei Li", title = "{AB-tree}: index for concurrent random sampling and updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1835--1847", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538606", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538606", abstract = "There has been an increasing demand for real-time data analytics. Approximate Query Processing (AQP) is a popular option for that because it can use random sampling to trade some accuracy for lower query latency. 
However, the state-of-the-art AQP system \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2022:RTR, author = "Chenguang Fang and Shaoxu Song and Yinan Mei", title = "On repairing timestamps for regular interval time series", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1848--1860", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538607", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538607", abstract = "Time series data are often with regular time intervals, e.g., in IoT scenarios sensor data collected with a pre-specified frequency, air quality data regularly recorded by outdoor monitors, and GPS signals periodically received from multiple satellites. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2022:TEP, author = "Wenfei Fan and Ruochun Jin and Ping Lu and Chao Tian and Ruiqi Xu", title = "Towards event prediction in temporal graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1861--1874", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538608", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538608", abstract = "This paper proposes a class of temporal association rules, denoted by TACOs, for event prediction. 
As opposed to previous graph rules, TACOs monitor updates to graphs, and can be used to capture temporal interests in recommendation and catch frauds in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2022:DCH, author = "Yihuai Liang and Yan Li and Byeong-Seok Shin", title = "Decentralized crowdsourcing for human intelligence tasks with efficient on-chain cost", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1875--1888", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538609", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538609", abstract = "Crowdsourcing for Human Intelligence Tasks (HIT) has been widely used to crowdsource human knowledge, such as image annotation for machine learning. We use a public blockchain to play the role of traditional centralized HIT systems, such that the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:TDB, author = "Yue Wang and Ruiqi Xu and Xun Jian and Alexander Zhou and Lei Chen", title = "Towards distributed bitruss decomposition on bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1889--1901", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538610", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538610", abstract = "Mining cohesive subgraphs on bipartite graphs is an important task. 
The $k$-bitruss is one of many popular cohesive subgraph models, which is the maximal subgraph where each edge is contained in at least $k$ butterflies. The bitruss decomposition problem is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gagliardelli:2022:GSM, author = "Luca Gagliardelli and George Papadakis and Giovanni Simonini and Sonia Bergamaschi and Themis Palpanas", title = "Generalized supervised meta-blocking", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1902--1910", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538611", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538611", abstract = "Entity Resolution is a core data integration task that relies on Blocking to scale to large datasets. Schema-agnostic blocking achieves very high recall, requires no domain knowledge and applies to data of any structuredness and schema heterogeneity. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{An:2022:YRO, author = "Mijin An and Soojun Im and Dawoon Jung and Sang-Won Lee", title = "Your read is our priority in flash storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1911--1923", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538612", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538612", abstract = "When replacing a dirty victim page upon page miss, the conventional buffer managers flush the dirty victim first to the storage before reading the missing page. This read-after-write (RAW) protocol, unfortunately, causes the read stall problem on flash \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bhattacharya:2022:NWO, author = "Arindam Bhattacharya and Chathur Gudesa and Amitabha Bagchi and Srikanta Bedathur", title = "New wine in an old bottle: data-aware hash functions for {Bloom} filters", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1924--1936", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538613", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538613", abstract = "In many applications of Bloom filters, it is possible to exploit the patterns present in the inserted and non-inserted keys to achieve more compression than the standard Bloom filter. 
A new class of Bloom filters called Learned Bloom filters use machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Peng:2022:SEA, author = "Jingshu Peng and Zhao Chen and Yingxia Shao and Yanyan Shen and Lei Chen and Jiannong Cao", title = "{Sancus}: staleness-aware communication-avoiding full-graph decentralized training in large-scale graph neural networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1937--1950", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538614", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538614", abstract = "Graph neural networks (GNNs) have emerged due to their success at modeling graph data. Yet, it is challenging for GNNs to efficiently scale to large graphs. Thus, distributed GNNs come into play. To avoid communication caused by expensive data movement \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bucchi:2022:CCE, author = "Marco Bucchi and Alejandro Grez and Andr{\'e}s Quintana and Cristian Riveros and Stijn Vansummeren", title = "{CORE}: a complex event recognition engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1951--1964", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538615", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538615", abstract = "Complex Event Recognition (CER) systems are a prominent technology for finding user-defined query patterns over large data streams in real time. CER query evaluation is known to be computationally challenging, since it requires maintaining a set of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2022:TEE, author = "Audrey Cheng and Xiao Shi and Aaron Kabcenell and Shilpa Lawande and Hamza Qadeer and Jason Chan and Harrison Tin and Ryan Zhao and Peter Bailis and Mahesh Balakrishnan and Nathan Bronson and Natacha Crooks and Ion Stoica", title = "{TAOBench}: an end-to-end benchmark for social network workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "9", pages = "1965--1977", month = may, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3538598.3538616", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Jul 28 06:16:23 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3538598.3538616", abstract = "The continued emergence of large social network applications has introduced a scale of data and query volume that challenges the limits of existing data stores. However, few benchmarks accurately simulate these request patterns, leaving researchers in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kakaraparthy:2022:VHA, author = "Aarati Kakaraparthy and Jignesh M. Patel and Brian P. Kroth and Kwanghyun Park", title = "{VIP} hashing: adapting to skew in popularity of data on the fly", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "1978--1990", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547306", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547306", abstract = "All data is not equally popular. 
Often, some portion of data is more frequently accessed than the rest, which causes a skew in popularity of the data items. Adapting to this skew can improve performance, and this topic has been studied extensively in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vincon:2022:NDP, author = "Tobias Vin{\c{c}}on and Christian Kn{\"o}dler and Leonardo Solis-Vasquez and Arthur Bernhardt and Sajjad Tamimi and Lukas Weber and Florian Stock and Andreas Koch and Ilia Petrov", title = "Near-data processing in database systems on native computational storage under {HTAP} workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "1991--2004", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547307", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547307", abstract = "Today's Hybrid Transactional and Analytical Processing (HTAP) systems, tackle the ever-growing data in combination with a mixture of transactional and analytical workloads. While optimizing for aspects such as data freshness and performance isolation, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Echihabi:2022:HAD, author = "Karima Echihabi and Panagiota Fatourou and Kostas Zoumpatianos and Themis Palpanas and Houda Benbrahim", title = "{Hercules} against data series similarity search", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2005--2018", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547308", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547308", abstract = "We propose Hercules, a parallel tree-based technique for exact similarity search on massive disk-based data series collections. We present novel index construction and query answering algorithms that leverage different summarization techniques, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Siddiqui:2022:DLO, author = "Tarique Siddiqui and Wentao Wu and Vivek Narasayya and Surajit Chaudhuri", title = "{DISTILL}: low-overhead data-driven techniques for filtering and costing indexes for scalable index tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2019--2031", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547309", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547309", abstract = "Many database systems offer index tuning tools that help automatically select appropriate indexes for improving the performance of an input workload. 
Index tuning is a resource-intensive and time-consuming task requiring expensive optimizer calls for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2022:OML, author = "Zhihui Yang and Zuozhi Wang and Yicong Huang and Yao Lu and Chen Li and X. Sean Wang", title = "Optimizing machine learning inference queries with correlative proxy models", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2032--2044", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547310", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547310", abstract = "We consider accelerating machine learning (ML) inference queries on unstructured datasets. Expensive operators such as feature extractors and classifiers are deployed as user-defined functions (UDFs), which are not penetrable with classic query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Su:2022:BSD, author = "Li Su and Xiaoming Qin and Zichao Zhang and Rui Yang and Le Xu and Indranil Gupta and Wenyuan Yu and Kai Zeng and Jingren Zhou", title = "{Banyan}: a scoped dataflow engine for graph query service", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2045--2057", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547311", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547311", abstract = "Graph query services (GQS) are widely used today to interactively answer graph traversal queries on large-scale graph data. Existing graph query engines focus largely on optimizing the latency of a single query. This ignores significant challenges posed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2022:FEU, author = "Ziyue Huang and Yuan Qiu and Ke Yi and Graham Cormode", title = "Frequency estimation under multiparty differential privacy: one-shot and streaming", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2058--2070", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547312", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547312", abstract = "We study the fundamental problem of frequency estimation under both privacy and communication constraints, where the data is distributed among k parties. 
We consider two application scenarios: (1) one-shot, where the data is static and the aggregator \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ali:2022:OIS, author = "Ahsan Ali and Riccardo Pinciroli and Feng Yan and Evgenia Smirni", title = "Optimizing inference serving on serverless platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2071--2084", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547313", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547313", abstract = "Serverless computing is gaining popularity for machine learning (ML) serving workload due to its autonomous resource scaling, easy to use and pay-per-use cost model. Existing serverless platforms work well for image-based ML inference, where requests \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alkowaileet:2022:CFS, author = "Wail Y. Alkowaileet and Michael J. Carey", title = "Columnar formats for schemaless {LSM}-based document stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2085--2097", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547314", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547314", abstract = "In the last decade, document store database systems have gained more traction for storing and querying large volumes of semi-structured data. 
However, the flexibility of the document stores' data models has limited their ability to store data in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiu:2022:ESP, author = "Yu-Xuan Qiu and Dong Wen and Lu Qin and Wentao Li and Rong-Hua Li and Ying Zhang", title = "Efficient shortest path counting on large road networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2098--2110", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547315", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547315", abstract = "The shortest path distance and related concepts lay the foundations of many real-world applications in road network analysis. The shortest path count has drawn much research attention in academia, not only as a closeness metric accompanying the shorted \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fu:2022:TCE, author = "Fangcheng Fu and Xupeng Miao and Jiawei Jiang and Huanran Xue and Bin Cui", title = "Towards communication-efficient vertical federated learning training via cache-enabled local updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2111--2120", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547316", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547316", abstract = "Vertical federated learning (VFL) is an emerging paradigm that allows different parties (e.g., organizations or enterprises) to collaboratively build machine learning models with privacy protection. In the training phase, VFL only exchanges the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2022:DED, author = "Yifan Zhu and Lu Chen and Yunjun Gao and Baihua Zheng and Pengfei Wang", title = "{DESIRE}: an efficient dynamic cluster-based forest indexing for similarity search in multi-metric spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2121--2133", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547317", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547317", abstract = "Similarity search finds similar objects for a given query object based on a certain similarity metric. 
Similarity search in metric spaces has attracted increasing attention, as the metric space can accommodate any type of data and support flexible \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2022:AAB, author = "Junghoon Kim and Kaiyu Feng and Gao Cong and Diwen Zhu and Wenyuan Yu and Chunyan Miao", title = "{ABC}: attributed bipartite co-clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2134--2147", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547318", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547318", abstract = "Finding a set of co-clusters in a bipartite network is a fundamental and important problem. In this paper, we present the Attributed Bipartite Co-clustering (ABC) problem which unifies two main concepts: (i) bipartite modularity optimization, and (ii) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2022:TSD, author = "Jinzhao Xiao and Yuxiang Huang and Changyu Hu and Shaoxu Song and Xiangdong Huang and Jianmin Wang", title = "Time series data encoding for efficient storage: a comparative analysis in {Apache IoTDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2148--2160", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547319", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547319", abstract = "Not only the vast applications but also the distinct features of time series data stimulate the booming growth of time series database management systems, such as Apache IoTDB, InfluxDB, OpenTSDB and so on. Almost all these systems employ columnar \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:SLO, author = "Teng Zhang and Jian Tan and Xin Cai and Jianying Wang and Feifei Li and Jianling Sun", title = "{SA-LSM}: optimize data layout for {LSM}-tree based storage using survival analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2161--2174", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547320", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547320", abstract = "A significant fraction of data in cloud storage is rarely accessed, referred to as cold data. 
Accurately identifying and efficiently managing cold data on cost-effective storages is one of the major challenges for cloud providers, which balances between \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ferragina:2022:IMV, author = "Paolo Ferragina and Giovanni Manzini and Travis Gagie and Dominik K{\"o}ppl and Gonzalo Navarro and Manuel Striani and Francesco Tosoni", title = "Improving matrix-vector multiplication via lossless grammar-compressed matrices", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2175--2187", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547321", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547321", abstract = "As nowadays Machine Learning (ML) techniques are generating huge data collections, the problem of how to efficiently engineer their storage and operations is becoming of paramount importance. In this article we propose a new lossless compression scheme \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2022:NRL, author = "Shangyu Wu and Yufei Cui and Jinghuan Yu and Xuan Sun and Tei-Wei Kuo and Chun Jason Xue", title = "{NFL}: robust learned index via distribution transformation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2188--2200", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547322", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547322", abstract = "Recent works on learned index open a new direction for the indexing field. The key insight of the learned index is to approximate the mapping between keys and positions with piece-wise linear functions. Such methods require partitioning key space for a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zare:2022:LLG, author = "Hamidreza Zare and Viveck Ramesh Cadambe and Bhuvan Urgaonkar and Nader Alfares and Praneet Soni and Chetan Sharma and Arif A. Merchant", title = "{LEGOStore}: a linearizable geo-distributed store combining replication and erasure coding", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2201--2215", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547323", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547323", abstract = "We design and implement LEGOStore, an erasure coding (EC) based linearizable data store over geo-distributed public cloud data centers (DCs). 
For such a data store, the confluence of the following factors opens up opportunities for EC to be latency- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Simpson:2022:MMU, author = "Michael Simpson and Farnoosh Hashemi and Laks V. S. Lakshmanan", title = "Misinformation mitigation under differential propagation rates and temporal penalties", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2216--2229", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547324", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547324", abstract = "We propose an information propagation model that captures important temporal aspects that have been well observed in the dynamics of fake news diffusion, in contrast with the diffusion of truth. The model accounts for differential propagation rates of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2022:SDL, author = "Lixi Zhou and Jiaqing Chen and Amitabh Das and Hong Min and Lei Yu and Ming Zhao and Jia Zou", title = "Serving deep learning models with deduplication from relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2230--2243", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547325", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547325", abstract = "Serving deep learning models from relational databases brings significant benefits. 
First, features extracted from databases do not need to be transferred to any decoupled deep learning systems for inferences, and thus the system management overhead can \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2022:DOI, author = "Zichun Huang and Shimin Chen", title = "Density-optimized intersection-free mapping and matrix multiplication for join-project operations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2244--2256", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547326", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547326", abstract = "A Join-Project operation is a join operation followed by a duplicate eliminating projection operation. It is used in a large variety of applications, including entity matching, set analytics, and graph analytics. Previous work proposes a hybrid design \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jahangiri:2022:DTO, author = "Shiva Jahangiri and Michael J. Carey and Johann-Christoph Freytag", title = "Design trade-offs for a robust dynamic hybrid hash join", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2257--2269", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547327", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547327", abstract = "Hybrid Hash Join (HHJ) has proven to be one of the most efficient and widely-used join algorithms. 
While HHJ's performance depends largely on accurate statistics and information about the input relations, it may not always be practical or possible for a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Foufoulas:2022:YYE, author = "Yannis Foufoulas and Alkis Simitsis and Lefteris Stamatogiannakis and Yannis Ioannidis", title = "{YeSQL}: ``you extend {SQL}'' with rich and highly performant user-defined functions in relational databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2270--2283", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547328", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547328", abstract = "The diversity and complexity of modern data management applications have led to the extension of the relational paradigm with syntactic and semantic support for User-Defined Functions (UDFs). Although well-established in traditional DBMS settings, UDFs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmetaj:2022:MSS, author = "Shqiponja Ahmetaj and Bianca L{\"o}hnert and Magdalena Ortiz and Mantas {\v{S}}imkus", title = "Magic shapes for {SHACL} validation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "10", pages = "2284--2296", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3547305.3547329", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Sep 8 11:58:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3547305.3547329", abstract = "A key prerequisite for the successful adoption of the Shapes Constraint Language (SHACL)---the W3C standardized constraint language for RDF graphs---is the availability of automated tools that efficiently validate targeted constraints (known as shapes \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Merchant:2022:SGR, author = "Arpit Merchant and Aristides Gionis and Michael Mathioudakis", title = "Succinct graph representations as distance oracles: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2297--2306", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551794", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551794", abstract = "Distance oracles answer shortest-path queries between any pair of nodes in a graph. They are often built using succinct graph representations such as spanners, sketches, and compressors to minimize oracle size and query answering latency. Node \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2022:ECS, author = "Yangqin Jiang and Yixiang Fang and Chenhao Ma and Xin Cao and Chunshan Li", title = "Effective community search over large star-schema heterogeneous information networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2307--2320", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551795", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551795", abstract = "Community search (CS) enables personalized community discovery and has found a wide spectrum of emerging applications such as setting up social events and friend recommendation. While CS has been extensively studied for conventional homogeneous networks, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ting:2022:NDT, author = "Kai Ming Ting and Zongyou Liu and Hang Zhang and Ye Zhu", title = "A new distributional treatment for time series and an anomaly detection investigation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2321--2333", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551796", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551796", abstract = "Time series is traditionally treated with two main approaches, i.e., the time domain approach and the frequency domain approach. 
These approaches must rely on a sliding window so that time-shift versions of a periodic subsequence can be measured to be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Denham:2022:WUL, author = "Benjamin Denham and Edmund M-K. Lai and Roopak Sinha and M. Asif Naeem", title = "{Witan}: unsupervised labelling function generation for assisted data programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2334--2347", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551797", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551797", abstract = "Effective supervised training of modern machine learning models often requires large labelled training datasets, which could be prohibitively costly to acquire for many practical applications. Research addressing this problem has sought ways to leverage \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bao:2022:SMM, author = "Ergute Bao and Yizheng Zhu and Xiaokui Xiao and Yin Yang and Beng Chin Ooi and Benjamin Hong Meng Tan and Khin Mi Mi Aung", title = "{Skellam} mixture mechanism: a novel approach to federated learning with differential privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2348--2360", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551798", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551798", abstract = "Deep neural networks have strong capabilities of memorizing the underlying training data, which can be a serious privacy concern. An effective solution to this problem is to train models with differential privacy (DP), which provides rigorous privacy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hilprecht:2022:ZSC, author = "Benjamin Hilprecht and Carsten Binnig", title = "Zero-shot cost models for out-of-the-box learned cost prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2361--2374", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551799", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551799", abstract = "In this paper, we introduce zero-shot cost models, which enable learned cost estimation that generalizes to unseen databases. 
In contrast to state-of-the-art workload-driven approaches, which require to execute a large set of training queries on every \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Choi:2022:WMG, author = "Dalsu Choi and Hyunsik Yoon and Hyubjin Lee and Yon Dohn Chung", title = "{Waffle}: in-memory grid index for moving objects with reinforcement learning-based configuration tuning system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2375--2388", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551800", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551800", abstract = "Location-based services for moving objects are close to our lives. For example, ride-sharing services, micro-mobility services, navigation and traffic management, delivery services, and autonomous driving are all based on moving objects. The efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jungmair:2022:DOF, author = "Michael Jungmair and Andr{\'e} Kohn and Jana Giceva", title = "Designing an open framework for query optimization and compilation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2389--2401", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551801", abstract = "Since its invention, data-centric code generation has been adopted for query compilation by various database systems in academia and industry. These database systems are fast but maximize performance at the expense of developer friendliness, flexibility, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nguyen:2022:PST, author = "Lam-Duy Nguyen and Sang-Won Lee and Beomseok Nam", title = "In-page shadowing and two-version timestamp ordering for mobile {DBMSs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2402--2414", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551802", abstract = "Increasing the concurrency level in mobile database systems has not received much attention, mainly because the concurrency requirements of mobile workloads has been regarded to be low. Contrary to popular belief, mobile workloads require higher \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2022:REA, author = "Shixuan Sun and Xibo Sun and Bingsheng He and Qiong Luo", title = "{RapidFlow}: an efficient approach to continuous subgraph matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2415--2427", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551803", abstract = "Continuous subgraph matching (CSM) is an important building block in many real-time graph processing applications. Given a subgraph query Q and a data graph stream, a CSM algorithm reports the occurrences of Q in the stream. Specifically, when a new \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Helali:2022:SAA, author = "Mossad Helali and Essam Mansour and Ibrahim Abdelaziz and Julian Dolby and Kavitha Srinivas", title = "A scalable {AutoML} approach based on graph neural networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2428--2436", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551804", abstract = "AutoML systems build machine learning models automatically by performing a search over valid data transformations and learners, along with hyper-parameter optimization for each learner. 
Many AutoML systems use meta-learning to guide search for optimal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pappachan:2022:DTT, author = "Primal Pappachan and Shufan Zhang and Xi He and Sharad Mehrotra", title = "Don't be a tattle-tale: preventing leakages through data dependencies on access control protected data", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2437--2449", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551805", abstract = "We study the problem of answering queries when (part of) the data may be sensitive and should not be leaked to the querier. Simply restricting the computation to non-sensitive part of the data may leak sensitive data through inference based on data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2022:ELB, author = "Qingyu Xu and Feng Zhang and Zhiming Yao and Lv Lu and Xiaoyong Du and Dong Deng and Bingsheng He", title = "Efficient load-balanced butterfly counting on {GPU}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2450--2462", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551806", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551806", abstract = "Butterfly counting is an important and costly operation for large bipartite graphs. 
GPUs are popular parallel heterogeneous devices and can bring significant performance improvement for data science applications. Unfortunately, no work enables efficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Benson:2022:PBB, author = "Lawrence Benson and Leon Papke and Tilmann Rabl", title = "{PerMA}-bench: benchmarking persistent memory access", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2463--2476", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551807", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551807", abstract = "Persistent memory's (PMem) byte-addressability and persistence at DRAM-like speed with SSD-like capacity have the potential to cause a major performance shift in database storage systems. With the availability of Intel Optane DC Persistent Memory, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2022:EPM, author = "Yuliang He and Duo Lu and Kaisong Huang and Tianzheng Wang", title = "Evaluating persistent memory range indexes: part two", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2477--2490", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551808", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551808", abstract = "Scalable persistent memory (PM) has opened up new opportunities for building indexes that operate and persist data directly on the memory bus, potentially enabling instant recovery, low latency and high throughput. When real PM hardware (Intel Optane \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yogatama:2022:ODP, author = "Bobbi W. Yogatama and Weiwei Gong and Xiangyao Yu", title = "Orchestrating data placement and query execution in heterogeneous {CPU-GPU DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2491--2503", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551809", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Oct 29 08:52:37 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3551793.3551809", abstract = "There has been a growing interest in using GPU to accelerate data analytics due to its massive parallelism and high memory bandwidth. The main constraint of using GPU for data analytics is the limited capacity of GPU memory. Heterogeneous CPU-GPU query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2022:IMO,
  author =       "Weicheng Wang and Raymond Chi-Wing Wong",
  title =        "Interactive mining with ordered and unordered attributes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2504--2516",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551810",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551810",
  abstract =     "There are various queries proposed to assist users in finding their favorite tuples from a dataset with the help of user interaction. Specifically, they interact with a user by asking questions. Each question presents two tuples, which are selected from \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2022:FDS,
  author =       "Wenzhe Yang and Sheng Wang and Yuan Sun and Zhiyong Peng",
  title =        "Fast dataset search with earth mover's distance",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2517--2529",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551811",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551811",
  abstract =     "The amount of spatial data in open data portals has increased rapidly, raising the demand for spatial dataset search in large data repositories. In this paper, we tackle spatial dataset search by using the Earth Mover's Distance (EMD) to measure the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Pereira:2022:AST,
  author =       "Jo{\~a}o L. M. Pereira and Jo{\~a}o Casanova and Helena Galhardas and Dennis Shasha",
  title =        "{AcX}: system, techniques, and experiments for acronym expansion",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2530--2544",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551812",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551812",
  abstract =     "In this information-accumulating world, each of us must learn continuously. To participate in a new field, or even a sub-field, one must be aware of the terminology including the acronyms that specialists know so well, but newcomers do not. Building on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2022:GTH,
  author =       "Hongzhi Chen and Changji Li and Chenguang Zheng and Chenghuan Huang and Juncheng Fang and James Cheng and Jian Zhang",
  title =        "{G-Tran}: a high performance distributed graph database with a decentralized architecture",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2545--2558",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551813",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551813",
  abstract =     "Graph transaction processing poses unique challenges such as random data access due to the irregularity of graph structures, low throughput and high abort rate due to the relatively large read/write sets in graph transactions. To address these \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Konig:2022:TPS,
  author =       "Arnd Christian K{\"o}nig and Yi Shan and Tobias Ziegler and Aarati Kakaraparthy and Willis Lang and Justin Moeller and Ajay Kalhan and Vivek Narasayya",
  title =        "Tenant placement in over-subscribed database-as-a-service clusters",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2559--2571",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551814",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551814",
  abstract =     "Relational cloud Database-as-a-Service offerings run on multi-tenant infrastructure consisting of clusters of nodes, with each node hosting multiple tenant databases. Such clusters may be over-subscribed to increase resource utilization and improve \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2022:EBS,
  author =       "Yue Chen and Kaiyu Feng and Gao Cong and Han Mao Kiah",
  title =        "Example-based spatial pattern matching",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2572--2584",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551815",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551815",
  abstract =     "The prevalence of GPS-enabled mobile devices and location-based services yield massive volume of spatial objects where each object contains information including geographical location, name, address, category and other attributes.
This paper introduces \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Peng:2022:NFP,
  author =       "Zeshun Peng and Yanfeng Zhang and Qian Xu and Haixu Liu and Yuxiao Gao and Xiaohua Li and Ge Yu",
  title =        "{NeuChain}: a fast permissioned blockchain system with deterministic ordering",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2585--2598",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551816",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551816",
  abstract =     "Blockchain serves as a replicated transactional processing system in a trustless distributed environment. Existing blockchain systems all rely on an explicit ordering step to determine the global order of transactions that are collected from multiple \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{McKenna:2022:AAI,
  author =       "Ryan McKenna and Brett Mullins and Daniel Sheldon and Gerome Miklau",
  title =        "{AIM}: an adaptive and iterative mechanism for differentially private synthetic data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2599--2612",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551817",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551817",
  abstract =     "We propose AIM, a new algorithm for differentially private synthetic data generation.
AIM is a workload-adaptive algorithm within the paradigm of algorithms that first selects a set of queries, then privately measures those queries, and finally \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Toussaint:2022:TNV,
  author =       "Etienne Toussaint and Paolo Guagliardo and Leonid Libkin and Juan Sequeda",
  title =        "Troubles with nulls, views from the users",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2613--2625",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551818",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551818",
  abstract =     "Incomplete data, in the form of null values, has been extensively studied since the inception of the relational model in the 1970s. Anecdotally, one hears that the way in which SQL, the standard language for relational databases, handles nulls creates a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Park:2022:GSE,
  author =       "Yeonhong Park and Sunhong Min and Jae W.
Lee",
  title =        "{Ginex}: {SSD}-enabled billion-scale graph neural network training on a single machine via provably optimal in-memory caching",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2626--2639",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551819",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551819",
  abstract =     "Graph Neural Networks (GNNs) are receiving a spotlight as a powerful tool that can effectively serve various inference tasks on graph structured data. As the size of real-world graphs continues to scale, the GNN training system faces a scalability \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2022:SPQ,
  author =       "Junhua Zhang and Wentao Li and Long Yuan and Lu Qin and Ying Zhang and Lijun Chang",
  title =        "Shortest-path queries on complex networks: experiments, analyses, and improvement",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2640--2652",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551820",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551820",
  abstract =     "The shortest-path query, which returns the shortest path between two vertices, is a basic operation on complex networks and has numerous applications. To handle shortest-path queries, one option is to use traversal-based methods (e.g., breadth-first \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ghayyur:2022:MAA,
  author =       "Sameera Ghayyur and Dhrubajyoti Ghosh and Xi He and Sharad Mehrotra",
  title =        "{MIDE}: accuracy aware minimally invasive data exploration for decision support",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2653--2665",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551821",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551821",
  abstract =     "This paper studies privacy in the context of decision-support queries that classify objects as either true or false based on whether they satisfy the query. Mechanisms to ensure privacy may result in false positives and false negatives. In decision-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ghosh:2022:JJT,
  author =       "Dhrubajyoti Ghosh and Peeyush Gupta and Sharad Mehrotra and Roberto Yus and Yasser Altowim",
  title =        "{JENNER}: just-in-time enrichment in query processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2666--2678",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551822",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551822",
  abstract =     "Emerging domains, such as sensor-driven smart spaces and social media analytics, require incoming data to be enriched prior to its use.
Enrichment often consists of machine learning (ML) functions that are too expensive/infeasible to execute at \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2022:CCA,
  author =       "Jiaoyi Zhang and Yihan Gao",
  title =        "{CARMI}: a cache-aware learned index with a cost-based construction algorithm",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2679--2691",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551823",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Oct 29 08:52:37 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3551793.3551823",
  abstract =     "Learned indexes, which use machine learning models to replace traditional index structures, have shown promising results in recent studies. However, existing learned indexes exhibit a performance gap between synthetic and real-world datasets, making \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Swift:2022:MFC,
  author =       "Ian P. Swift and Sana Ebrahimi and Azade Nova and Abolfazl Asudeh",
  title =        "Maximizing fair content spread via edge suggestion in social networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2692--2705",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551824",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Content spread inequity is a potential unfairness issue in online social networks, disparately impacting minority groups.
In this paper, we view friendship suggestion, a common feature in social network platforms, as an opportunity to achieve an \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Hertzschuch:2022:TCS,
  author =       "Axel Hertzschuch and Claudio Hartmann and Dirk Habich and Wolfgang Lehner",
  title =        "Turbo-charging {SPJ} query plans with learned physical join operator selections",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2706--2718",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551825",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The optimization of select-project-join (SPJ) queries entails two major challenges: (i) finding a good join order and (ii) selecting the best-fitting physical join operator for each single join within the chosen join order. Previous work mainly focuses \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ma:2022:FLD,
  author =       "Chenhao Ma and Reynold Cheng and Laks V. S. Lakshmanan and Xiaolin Han",
  title =        "Finding locally densest subgraphs: a convex programming approach",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2719--2732",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551826",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Finding the densest subgraph (DS) from a graph is a fundamental problem in graph databases.
The DS obtained, which reveals closely related entities, has been found to be useful in various application domains such as e-commerce, social science, and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Shao:2022:DDS,
  author =       "Zezhi Shao and Zhao Zhang and Wei Wei and Fei Wang and Yongjun Xu and Xin Cao and Christian S. Jensen",
  title =        "Decoupled dynamic spatial-temporal graph neural network for traffic forecasting",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2733--2746",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551827",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We all depend on mobility, and vehicular transportation affects the daily lives of most of us. Thus, the ability to forecast the state of traffic in a road network is an important functionality and a challenging task. Traffic data is often obtained from \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2022:HOH,
  author =       "Youjie Li and Amar Phanishayee and Derek Murray and Jakub Tarnawski and Nam Sung Kim",
  title =        "{Harmony}: overcoming the hurdles of {GPU} memory capacity to train massive {DNN} models on commodity servers",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2747--2760",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551828",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Deep neural networks (DNNs) have grown exponentially in size over the past decade, leaving only those who have massive datacenter-based resources with the ability to develop and train such models. One of the main challenges for the long tail of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Luo:2022:SVD,
  author =       "Xuan Luo and Jian Pei and Zicun Cong and Cheng Xu",
  title =        "On {Shapley} value in data assemblage under independent utility",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2761--2773",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551829",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In many applications, an organization may want to acquire data from many data owners. Data marketplaces allow data owners to produce data assemblage needed by data buyers through coalition. To encourage coalitions to produce data, it is critical to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Paparrizos:2022:VUS,
  author =       "John Paparrizos and Paul Boniol and Themis Palpanas and Ruey S. Tsay and Aaron Elmore and Michael J. Franklin",
  title =        "Volume under the surface: a new accuracy evaluation measure for time-series anomaly detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2774--2787",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551830",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Anomaly detection (AD) is a fundamental task for time-series analytics with important implications for the downstream performance of many applications. In contrast to other domains where AD mainly focuses on point-based anomalies (i.e., outliers in \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yin:2022:ASC,
  author =       "Haoteng Yin and Muhan Zhang and Yanbang Wang and Jianguo Wang and Pan Li",
  title =        "Algorithm and system co-design for efficient subgraph-based graph representation learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2788--2796",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551831",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Subgraph-based graph representation learning (SGRL) has been recently proposed to deal with some fundamental challenges encountered by canonical graph neural networks (GNNs), and has demonstrated advantages in many important data science applications \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Freitag:2022:MOM,
  author =       "Michael Freitag and Alfons Kemper and Thomas Neumann",
  title =        "Memory-optimized multi-version concurrency control for disk-based database systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2797--2810",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551832",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Pure in-memory database systems offer outstanding performance but degrade heavily if the working set does not fit into DRAM, which is problematic in view of declining main memory growth rates. In contrast, recently proposed memory-optimized disk-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{He:2022:QPT,
  author =       "Dong He and Supun C. Nakandala and Dalitso Banda and Rathijit Sen and Karla Saur and Kwanghyun Park and Carlo Curino and Jes{\'u}s Camacho-Rodr{\'\i}guez and Konstantinos Karanasos and Matteo Interlandi",
  title =        "Query processing on tensor computation runtimes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2811--2825",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551833",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The huge demand for computation in artificial intelligence (AI) is driving unparalleled investments in hardware and software systems for AI. This leads to an explosion in the number of specialized hardware devices, which are now offered by major cloud \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Tang:2022:RCS,
  author =       "Yifu Tang and Jianxin Li and Nur Al Hasan Haldar and Ziyu Guan and Jiajie Xu and Chengfei Liu",
  title =        "Reliable community search in dynamic networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2826--2838",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551834",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Searching for local communities is an important research problem that supports advanced data analysis in various complex networks, such as social networks, collaboration networks, cellular networks, etc. The evolution of such networks over time has \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Amiri:2022:QSM,
  author =       "Mohammad Javad Amiri and Boon Thau Loo and Divyakant Agrawal and Amr {El Abbadi}",
  title =        "{Qanaat}: a scalable multi-enterprise permissioned blockchain system with confidentiality guarantees",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2839--2852",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551835",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Today's large-scale data management systems need to address distributed applications' confidentiality and scalability requirements among a set of collaborative enterprises. This paper presents Qanaat, a scalable multi-enterprise permissioned blockchain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chan:2022:FNF,
  author =       "Tsz Nam Chan and Leong Hou U and Yun Peng and Byron Choi and Jianliang Xu",
  title =        "Fast network {$K$}-function-based spatial analysis",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2853--2866",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551836",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Network $K$-function has been the de facto operation for analyzing point patterns in spatial networks, which is widely used in many communities, including geography, ecology, transportation science, social science, and criminology. To analyze a location \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lasch:2022:CMO,
  author =       "Robert Lasch and Thomas Legler and Norman May and Bernhard Scheirle and Kai-Uwe Sattler",
  title =        "Cost modelling for optimal data placement in heterogeneous main memory",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2867--2880",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551837",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The cost of DRAM contributes significantly to the operating costs of in-memory database management systems (IMDBMS). Persistent memory (PMEM) is an alternative type of byte-addressable memory that offers --- in addition to persistence --- higher \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2022:SSN,
  author =       "Junru Li and Youyou Lu and Yiming Zhang and Qing Wang and Zhuo Cheng and Keji Huang and Jiwu Shu",
  title =        "{SwitchTx}: scalable in-network coordination for distributed transaction processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2881--2894",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551838",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Online-transaction-processing (OLTP) applications require the underlying storage system to guarantee consistency and serializability for distributed transactions involving large numbers of servers, which tends to introduce high coordination cost and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Vogel:2022:PWO,
  author =       "Lukas Vogel and Alexander van Renen and Satoshi Imamura and Jana Giceva and Thomas Neumann and Alfons Kemper",
  title =        "{Plush}: a write-optimized persistent log-structured hash-table",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2895--2907",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551839",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Persistent memory (PMem) promised DRAM-like performance, byte addressability, and the persistency guarantees of conventional block storage. With the release of Intel Optane DCPMM, those expectations were dampened. While its write latency competes with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2022:EID,
  author =       "Fangyuan Zhang and Sibo Wang",
  title =        "Effective indexing for dynamic structural graph clustering",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2908--2920",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551840",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Graph clustering is a fundamental data mining task that clusters vertices into different groups. The structural graph clustering algorithm (SCAN) is a widely used graph clustering algorithm that derives not only clustering results, but also special \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Trummer:2022:CSC,
  author =       "Immanuel Trummer",
  title =        "{CodexDB}: synthesizing code for query processing from natural language instructions using {GPT-3 Codex}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2921--2928",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551841",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "CodexDB enables users to customize SQL query processing via natural language instructions. CodexDB is based on OpenAI's GPT-3 Codex model which translates text into code. It is a framework on top of GPT-3 Codex that decomposes complex SQL queries into a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Phani:2022:UPS,
  author =       "Arnab Phani and Lukas Erlbacher and Matthias Boehm",
  title =        "{UPLIFT}: parallelization strategies for feature transformations in machine learning workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2929--2938",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551842",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data science pipelines are typically exploratory. An integral task of such pipelines are feature transformations, which transform raw data into numerical matrices or tensors for training or scoring. There exist a wide variety of transformations for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhou:2022:LSM,
  author =       "Xinjing Zhou and Xiangyao Yu and Goetz Graefe and Michael Stonebraker",
  title =        "{Lotus}: scalable multi-partition transactions on single-threaded partitioned databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "11",
  pages =        "2939--2952",
  month =        jul,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3551793.3551843",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:20:33 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper revisits the H-Store/VoltDB concurrency control scheme for partitioned main-memory databases, which we term run-to-completion-single-thread (RCST), with an eye toward improving its poor performance on multi-partition (MP) workloads. The \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kanellis:2022:LSE, author = "Konstantinos Kanellis and Cong Ding and Brian Kroth and Andreas M{\"u}ller and Carlo Curino and Shivaram Venkataraman", title = "{LlamaTune}: sample-efficient {DBMS} configuration tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2953--2965", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tuning a database system to achieve optimal performance on a given workload is a long-standing problem in the database community. A number of recent works have leveraged ML-based approaches to guide the sampling of large parameter spaces (hundreds of \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Winter:2022:DSS, author = "Christian Winter and Jana Giceva and Thomas Neumann and Alfons Kemper", title = "On-demand state separation for cloud data warehousing", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2966--2979", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Moving data analysis and processing to the cloud is no longer reserved for a few companies with petabytes of data. Instead, the flexibility of on-demand resources is attracting an increasing number of customers with small to medium-sized workloads. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trummer:2022:BBB, author = "Immanuel Trummer", title = "{BABOONS}: black-box optimization of data summaries in natural language", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2980--2993", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "BABOONS (BlAck BOx Optimization of Natural language data Summaries) optimizes text data summaries for an arbitrary, user-defined utility function. Primarily, it targets scenarios in which utility is evaluated via large language models. Users describe \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:CAD, author = "Xiaoying Wang and Weiyuan Wu and Jinze Wu and Yizhou Chen and Nick Zrymiak and Changbo Qu and Lampros Flokas and George Chow and Jiannan Wang and Tianzheng Wang and Eugene Wu and Qingqing Zhou", title = "{ConnectorX}: accelerating data loading from databases to dataframes", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "2994--3003", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data is often stored in a database management system (DBMS) but dataframe libraries are widely used among data scientists. An important but challenging problem is how to bridge the gap between databases and dataframes. To solve this problem, we present \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wongkham:2022:ULI, author = "Chaichon Wongkham and Baotong Lu and Chris Liu and Zhicong Zhong and Eric Lo and Tianzheng Wang", title = "Are updatable learned indexes ready?", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3004--3017", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, numerous promising results have shown that updatable learned indexes can perform better than traditional indexes with much lower memory space consumption. But it is unknown how these learned indexes compare against each other and against the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Reif:2022:SGA, author = "Maximilian Reif and Thomas Neumann", title = "A scalable and generic approach to range joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3018--3030", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analytical database systems provide great insights into large datasets and are an excellent tool for data exploration and analysis. A central pillar of query processing is the efficient evaluation of equi-joins, typically with linear-time algorithms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hohma:2022:SSC, author = "Ellen Hohma and Christian M. M. Frey and Anna Beer and Thomas Seidl", title = "{SCAR}: spectral clustering accelerated and robustified", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3031--3044", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spectral clustering is one of the most advantageous clustering approaches. However, standard Spectral Clustering is sensitive to noisy input data and has a high runtime complexity. Tackling one of these problems often exacerbates the other. As real-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Benedikt:2022:RIC, author = "Michael Benedikt and Maxime Buron and Stefano Germano and Kevin Kappelmann and Boris Motik", title = "Rewriting the infinite chase", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3045--3057", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Guarded tuple-generating dependencies (GTGDs) are a natural extension of description logics and referential constraints. It has long been known that queries over GTGDs can be answered by a variant of the chase --- a quintessential technique for reasoning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liakos:2022:CEL, author = "Panagiotis Liakos and Katia Papakonstantinopoulou and Yannis Kotidis", title = "{Chimp}: efficient lossless floating point compression for time series databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3058--3070", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Applications in diverse domains such as astronomy, economics and industrial monitoring, increasingly press the need for analyzing massive collections of time series data. The sheer size of the latter hinders our ability to efficiently store them and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dayan:2022:SGL, author = "Niv Dayan and Tamar Weiss and Shmuel Dashevsky and Michael Pan and Edward Bortnikov and Moshe Twitto", title = "{Spooky}: granulating {LSM}-tree compactions correctly", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3071--3084", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:20:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern storage engines and key-value stores have come to rely on the log-structured merge-tree (LSM-tree) as their core data structure. LSM-tree operates by gradually merge-sorting data across levels of exponentially increasing capacities in storage. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yao:2022:ISB, author = "Kai Yao and Lijun Chang and Jeffrey Xu Yu", title = "Identifying similar-bicliques in bipartite graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3085--3097", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Bipartite graphs have been widely used to model the relationship between entities of different types, where vertices are partitioned into two disjoint sets/sides. Finding dense subgraphs in a bipartite graph is of great significance and encompasses many \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lyu:2022:FGM, author = "Chenghao Lyu and Qi Fan and Fei Song and Arnab Sinha and Yanlei Diao and Wei Chen and Li Ma and Yihui Feng and Yaliang Li and Kai Zeng and Jingren Zhou", title = "Fine-grained modeling and optimization for intelligent resource management in big data processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3098--3111", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Big data processing at the production scale presents a highly complex environment for resource optimization (RO), a problem crucial for meeting performance goals and budgetary constraints of analytical users. The RO problem is challenging because it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2022:FCM, author = "Ziyi Liu and Lei Li and Mengxuan Zhang and Wen Hua and Xiaofang Zhou", title = "{FHL}-cube: multi-constraint shortest path querying with flexible combination of constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3112--3125", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-Constraint Shortest Path (MCSP) generalizes the classic shortest path from single to multiple criteria such that more personalized needs can be satisfied. However, MCSP query is essentially a high-dimensional skyline problem and thus time-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abebe:2022:TEP, author = "Michael Abebe and Horatiu Lazu and Khuzaima Daudjee", title = "{Tiresias}: enabling predictive autonomous storage and indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3126--3136", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To efficiently store and query a DBMS, administrators must select storage and indexing configurations. For example, one must decide whether data should be stored in rows or columns, in-memory or on disk, and which columns to index. These choices can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Asudeh:2022:TDA, author = "Abolfazl Asudeh and Fatemeh Nargesian", title = "Towards distribution-aware query answering in data markets", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3137--3144", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Addressing the increasing demand for data exchange has led to the development of data markets that facilitate transactional interactions between data buyers and data sellers. Still, cost-effective and distribution-aware query answering is a substantial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kwon:2022:CEA, author = "Suyong Kwon and Woohwan Jung and Kyuseok Shim", title = "Cardinality estimation of approximate substring queries using deep learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3145--3157", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality estimation of an approximate substring query is an important problem in database systems. Traditional approaches build a summary from the text data and estimate the cardinality using the summary with some statistical assumptions. Since deep \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Saur:2022:CEU, author = "Karla Saur and Tara Mirmira and Konstantinos Karanasos and Jes{\'u}s Camacho-Rodr{\'\i}guez", title = "Containerized execution of {UDFs}: an experimental evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3158--3171", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "User-defined functions (UDFs) have long been used as the de facto way to extend the capabilities of data management systems. However, they are restricted to the specificities of each DBMS, and recent demands for advanced analytics have increased the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xia:2022:DSD, author = "Siyuan Xia and Zhiru Zhu and Chris Zhu and Jinjin Zhao and Kyle Chard and Aaron J. Elmore and Ian Foster and Michael Franklin and Sanjay Krishnan and Raul Castro Fernandez", title = "Data station: delegated, trustworthy, and auditable computation to enable data-sharing consortia with a data escrow", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3172--3185", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Pooling and sharing data increases and distributes its value. But since data cannot be revoked once shared, scenarios that require controlled release of data for regulatory, privacy, and legal reasons default to not sharing. 
Because selectively \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ammar:2022:ODM, author = "Khaled Ammar and Siddhartha Sahu and Semih Salihoglu and M. Tamer {\"O}zsu", title = "Optimizing differentially-maintained recursive queries on dynamic graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3186--3198", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential computation (DC) is a highly general incremental computation/view maintenance technique that can maintain the output of an arbitrary and possibly recursive dataflow computation upon changes to its base inputs. As such, it is a promising \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2022:DTR, author = "Zihan Luo and Lei Li and Mengxuan Zhang and Wen Hua and Yehong Xu and Xiaofang Zhou", title = "Diversified top-$k$ route planning in road network", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3199--3212", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551863", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Route planning is ubiquitous and has a profound impact on our daily life. However, the existing path algorithms tend to produce similar paths between similar OD (Origin-Destination) pairs because they optimize query results without considering their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2022:MSE, author = "Xiangmin Zhou and Lei Chen", title = "Migrating social event recommendation over microblogs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3213--3225", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551864", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real applications like crisis management require the real time awareness of critical situations. However, the services using traditional methods like phone calls can be easily delayed due to busy lines, transfer delays or limited communication ability \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:STC, author = "Yueting Chen and Nick Koudas and Xiaohui Yu and Ziqiang Yu", title = "Spatial and temporal constrained ranked retrieval over videos", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3226--3239", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551865", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent advances in Computer Vision (CV) algorithms have improved accuracy and efficiency, making video annotations possible with high accuracy. In this paper, we utilize the annotated data provided by such algorithms and construct graph representations \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liao:2022:SSG, author = "Ningyi Liao and Dingheng Mo and Siqiang Luo and Xiang Li and Pengcheng Yin", title = "{SCARA}: scalable graph neural networks with feature-oriented optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3240--3248", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551866", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent advances in data processing have stimulated the demand for learning graphs of very large scales. Graph Neural Networks (GNNs), being an emerging and powerful approach in solving graph learning tasks, are known to be difficult to scale up. Most \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Manousis:2022:EEG, author = "Antonis Manousis and Zhuo Cheng and Ran {Ben Basat} and Zaoxing Liu and Vyas Sekar", title = "Enabling efficient and general subpopulation analytics in multidimensional data streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3249--3262", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551867", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today's large-scale services (e.g., video streaming platforms, data centers, sensor grids) need diverse real-time summary statistics across multiple subpopulations of multidimensional datasets. However, state-of-the-art frameworks do not offer general \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:DST, author = "Qing Chen and Oded Lachish and Sven Helmer and Michael H. B{\"o}hlen", title = "Dynamic spanning trees for connectivity queries on fully-dynamic undirected graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "11", pages = "3263--3276", month = jul, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3551793.3551868", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:24:33 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Answering connectivity queries is fundamental to fully dynamic graphs where edges and vertices are inserted and deleted frequently. Existing work proposes data structures and algorithms with worst case guarantees. We propose a new data structure, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chiosa:2022:HAC, author = "Monica Chiosa and Fabio Maschi and Ingo M{\"u}ller and Gustavo Alonso and Norman May", title = "Hardware acceleration of compression and encryption in {SAP HANA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3277--3291", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554822", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554822", abstract = "With the advent of cloud computing, where computational resources are expensive and data movement needs to be secured and minimized, database management systems need to reconsider their architecture to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Graf:2022:FPB, author = "Martin Graf and Lukas Laskowski and Florian Papsdorf and Florian Sold and Roland Gremmelspacher and Felix Naumann and Fabian Panse", title = "{Frost}: a platform for benchmarking and exploring data matching results", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3292--3305", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554823", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554823", abstract = "``Bad'' data has a direct impact on 88\% of companies, with the average company losing 12\% of its revenue due to it. Duplicates --- multiple but different representations of the same real-world entities --- are among the main \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:BHP, author = "Changji Li and Hongzhi Chen and Shuai Zhang and Yingqian Hu and Chao Chen and Zhenjie Zhang and Meng Li and Xiangchen Li and Dongqing Han and Xiaohui Chen and Xudong Wang and Huiming Zhu and Xuwei Fu and Tingwei Wu and Hongfei Tan and Hengtian Ding and Mengjin Liu and Kangcheng Wang and Ting Ye and Lei Li and Xin Li and Yu Wang and Chenguang Zheng and Hao Yang and James Cheng", title = "{ByteGraph}: a high-performance distributed graph database in {ByteDance}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3306--3318", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554824", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554824", abstract = "Most products at ByteDance, e.g., TikTok, Douyin, and Toutiao, naturally generate massive amounts of graph data. To efficiently store, query and update massive graph data is challenging for the broad range of products at \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Das:2022:CEC, author = "Prakash Das and Shivangi Srivastava and Valentin Moskovich and Anmol Chaturvedi and Anant Mittal and Yongqin Xiao and Mosharaf Chowdhury", title = "{CDI-E}: an elastic cloud service for data engineering", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3319--3331", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554825", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554825", abstract = "We live in the gilded age of data-driven computing. With public clouds offering virtually unlimited amounts of compute and storage, enterprises collecting data about every aspect of their businesses, and advances in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:OED, author = "Sheng Wang and Yiran Li and Huorong Li and Feifei Li and Chengjin Tian and Le Su and Yanshan Zhang and Yubing Ma and Lie Yan and Yuanyuan Sun and Xuntao Cheng and Xiaolong Xie and Yu Zou", title = "{Operon}: an encrypted database for ownership-preserving data management", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3332--3345", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554826", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554826", abstract = "The past decade has witnessed the rapid development of cloud computing and data-centric applications. 
While these innovations offer numerous attractive features for data processing, they also bring in new issues about \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2022:TPF, author = "Caixin Gong and Chengjin Tian and Zhengheng Wang and Sheng Wang and Xiyu Wang and Qiulei Fu and Wu Qin and Long Qian and Rui Chen and Jiang Qi and Ruo Wang and Guoyun Zhu and Chenghu Yang and Wei Zhang and Feifei Li", title = "{Tair-PMem}: a fully durable non-volatile memory database", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3346--3358", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554827", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554827", abstract = "In-memory databases (IMDBs) have been the backbone of modern systems that demand high throughput and low latency. Because of the cost and volatility of DRAM, IMDBs become incompetent when dealing with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lambov:2022:TMC, author = "Branimir Lambov", title = "Trie memtables in {Cassandra}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3359--3371", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554828", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554828", abstract = "This paper discusses a new memtable implementation for Apache Cassandra which is based on tries (also called prefix trees) and byte-comparable representations of database keys. The implementation is already in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pedreira:2022:VMU, author = "Pedro Pedreira and Orri Erling and Masha Basmanova and Kevin Wilfong and Laith Sakka and Krishna Pai and Wei He and Biswapesh Chattopadhyay", title = "{Velox}: {Meta}'s unified execution engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3372--3384", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554829", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554829", abstract = "The ad-hoc development of new specialized computation engines targeted to very specific data workloads has created a siloed data landscape. Commonly, these engines share little to nothing with each other and are hard to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2022:OMT, author = "Zhenkun Yang and Chuanhui Yang and Fusheng Han and Mingqiang Zhuang and Bing Yang and Zhifeng Yang and Xiaojun Cheng and Yuzhong Zhao and Wenhui Shi and Huafeng Xi and Huang Yu and Bin Liu and Yi Pan and Boxue Yin and Junquan Chen and Quanqing Xu", title = "{OceanBase}: a 707 million {tpmC} distributed relational database system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3385--3397", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554830", abstract = "We have designed and developed OceanBase, a distributed relational database system from the very basics for a decade. Being a scale-out multi-tenant system, OceanBase is cross-region fault tolerant, which is based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lan:2022:VVR, author = "Hai Lan and Jiong Xie and Zhifeng Bao and Feifei Li and Wei Tian and Fang Wang and Sheng Wang and Ailin Zhang", title = "{VRE}: a versatile, robust, and economical trajectory data system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3398--3410", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554831", abstract = "Managing massive trajectory data from various moving objects has always been a demanding task. 
A desired trajectory data system should be versatile in its supported query types and distance functions, of low \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:BBH, author = "Jianjun Chen and Yonghua Ding and Ye Liu and Fangshi Li and Li Zhang and Mingyi Zhang and Kui Wei and Lixun Cao and Dan Zou and Yang Liu and Lei Zhang and Rui Shi and Wei Ding and Kai Wu and Shangyu Luo and Jason Sun and Yuming Liang", title = "{ByteHTAP}: {ByteDance}'s {HTAP} system with high data freshness and strong data consistency", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3411--3424", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554832", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554832", abstract = "In recent years, at ByteDance, we see more and more business scenarios that require performing complex analysis over freshly imported data, together with transaction support and strong data consistency. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wingerath:2022:BCW, author = "Wolfram Wingerath and Benjamin Wollmer and Markus Bestehorn and Stephan Succo and Sophie Ferrlein and Florian B{\"u}cklers and J{\"o}rn Domnik and Fabian Panse and Erik Witt and Anil Sener and Felix Gessert and Norbert Ritter", title = "{Beaconnect}: continuous web performance {A\slash B} testing at scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3425--3431", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554833", abstract = "Content delivery networks (CDNs) are critical for minimizing access latency in the Web as they efficiently distribute online resources across the globe. But since CDNs can only be enabled on the scope of entire \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2022:COC, author = "Zongzhi Chen and Xinjun Yang and Feifei Li and Xuntao Cheng and Qingda Hu and Zheyu Miao and Rongbiao Xie and Xiaofei Wu and Kang Wang and Zhao Song and Haiqing Sun and Zechao Zhuang and Yuming Yang and Jie Xu and Liang Yin and Wenchao Zhou and Sheng Wang", title = "{CloudJump}: optimizing cloud databases for cloud storages", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3432--3444", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554834", abstract = "There has been an increasing interest in building cloud-native databases that decouple computation and storage for elasticity. A cloud-native database often adopts a cloud storage underneath its storage engine, leveraging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2022:DMN, author = "Kaiping Zheng and Shaofeng Cai and Horng Ruey Chua and Melanie Herschel and Meihui Zhang and Beng Chin Ooi", title = "{DyHealth}: making neural networks dynamic for effective healthcare analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3445--3458", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554835", abstract = "In National University Hospital (NUH) in Singapore, we conduct healthcare analytics that analyzes heterogeneous electronic medical records (EMR) to support effective clinical decision-making on a daily basis. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mishchenko:2022:BCS, author = "Andrey Mishchenko and Dominique Danco and Abhilash Jindal and Adrian Blue", title = "{Blueprint}: a constraint-solving approach for document extraction", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3459--3471", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554836", abstract = "Blueprint is a declarative domain-specific language for document extraction. Users describe document layout using spatial, textual, semantic, and numerical fuzzy constraints, and the language runtime extracts the field-value \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2022:TCL, author = "Muzhi Yu and Zhaoxiang Lin and Jinan Sun and Runyun Zhou and Guoqiang Jiang and Hua Huang and Shikun Zhang", title = "{TencentCLS}: the cloud log service with high query performances", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3472--3482", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554837", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554837", abstract = "With the trend of cloud computing, the cloud log service is becoming increasingly important, as it plays a critical role in tasks such as root cause analysis, service monitoring and security audition. To meet these needs, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2022:GMD, author = "Jiong Xie and Zhen Chen and Jianwei Liu and Fang Wang and Feifei Li and Zhida Chen and Yinpei Liu and Songlu Cai and Zhenhua Fan and Fei Xiao and Yue Chen", title = "{Ganos}: a multidimensional, dynamic, and scene-oriented cloud-native spatial database engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3483--3495", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554838", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554838", abstract = "Recently, the trend of developing digital twins for smart cities has driven a need for managing large-scale multidimensional, dynamic, and scene-oriented spatial data. 
Due to larger data scale and more complex \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lakshman:2022:MHD, author = "Sarath Lakshman and Apaar Gupta and Rohan Suri and Scott Lashley and John Liang and Srinath Duvuru and Ravi Mayuram", title = "{Magma}: a high data density storage engine used in {Couchbase}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3496--3508", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554839", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554839", abstract = "We present Magma, a write-optimized high data density key-value storage engine used in the Couchbase NoSQL distributed document database. Today's write-heavy data-intensive applications like ad-serving, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cahoon:2022:DAS, author = "Joyce Cahoon and Wenjing Wang and Yiwen Zhu and Katherine Lin and Sean Liu and Raymond Truong and Neetu Singh and Chengcheng Wan and Alexandra Ciortea and Sreraman Narasimhan and Subru Krishnan", title = "{Doppler}: automated {SKU} recommendation in migrating {SQL} workloads to the cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3509--3521", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554840", abstract = "Selecting the optimal cloud target to migrate SQL estates from on-premises to the cloud remains a challenge. Current solutions are not only time-consuming and error-prone, requiring significant user input, but also fail to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Harizopoulos:2022:MNG, author = "Stavros Harizopoulos and Taylor Hopper and Morton Mo and Shyam Sundar Chandrasekaran and Tongguang Chen and Yan Cui and Nandini Ganesh and Gary Helmling and Hieu Pham and Sebastian Wong", title = "{Meta}'s next-generation realtime monitoring and analytics platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3522--3534", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554841", abstract = "Unlike traditional database systems where data and system availability are tied together, there is a wide class of systems targeting realtime monitoring and analytics over structured logs where these properties can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gaffney:2022:SPP, author = "Kevin P. Gaffney and Martin Prammer and Larry Brasfield and D. Richard Hipp and Dan Kennedy and Jignesh M. Patel", title = "{SQLite}: past, present, and future", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3535--3547", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554842", abstract = "In the two decades following its initial release, SQLite has become the most widely deployed database engine in existence. 
Today, SQLite is found in nearly every smartphone, computer, web browser, television, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2022:MCN, author = "Rentong Guo and Xiaofan Luan and Long Xiang and Xiao Yan and Xiaomeng Yi and Jigao Luo and Qianya Cheng and Weizhi Xu and Jiarui Luo and Frank Liu and Zhenshan Cao and Yanliang Qiao and Ting Wang and Bo Tang and Charles Xie", title = "{Manu}: a cloud native vector database management system", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3548--3561", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554843", abstract = "With the development of learning-based embedding models, embedding vectors are widely used for analyzing and searching unstructured data. As vector collections exceed billion-scale, fully managed and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Galhotra:2022:ARD, author = "Sainyam Galhotra and Udayan Khurana", title = "Automated relational data explanation using external semantic knowledge", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3562--3565", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554844", abstract = "In data science problems, understanding the data is a crucial first step. 
However, it can be challenging and time intensive for a data scientist who is not an expert in that domain. Several downstream tasks such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rossi:2022:KEF, author = "Andrea Rossi and Donatella Firmani and Paolo Merialdo and Tommaso Teofili", title = "{Kelpie}: an explainability framework for embedding-based link prediction models", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3566--3569", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554845", abstract = "The latest generations of Link Prediction (LP) models rely on embeddings to tackle incompleteness in Knowledge Graphs, achieving great performance at the cost of interpretability. Their opaqueness limits the trust \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2022:ODC, author = "Yin Lin and Brit Youngmann and Yuval Moskovitch and H. V. Jagadish and Tova Milo", title = "{OREO}: detection of cherry-picked generalizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3570--3573", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554846", abstract = "Data analytics often make sense of large data sets by generalization: aggregating from the detailed data to a more general context. 
Given a dataset, misleading generalizations can sometimes be drawn from a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kohn:2022:DWF, author = "Andr{\'e} Kohn and Dominik Moritz and Mark Raasveldt and Hannes M{\"u}hleisen and Thomas Neumann", title = "{DuckDB-Wasm}: fast analytical processing for the web", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3574--3577", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554847", abstract = "We introduce DuckDB-Wasm, a WebAssembly version of the database system DuckDB, to provide fast analytical processing for the Web. DuckDB-Wasm evaluates SQL queries asynchronously in web workers, supports \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xi:2022:EHL, author = "Yihai Xi and Ning Wang and Xinyu Chen and Yiyi Zhang and Zilong Wang and Zhihong Xu and Yue Wang", title = "{EasyDR}: a human-in-the-loop error detection \& repair platform for holistic table cleaning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3578--3581", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554848", abstract = "Many tables on the web suffer from multi-level and multi-type quality problems, but existing cleaning systems cannot provide a comprehensive quality improvement for them. Most of these systems are designed for solving a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pan:2022:HFD, author = "Xuchen Pan and Yongxin Tong and Chunbo Xue and Zimu Zhou and Junping Du and Yuxiang Zeng and Yexuan Shi and Xiaofei Zhang and Lei Chen and Yi Xu and Ke Xu and Weifeng Lv", title = "{Hu-Fu}: a data federation system for secure spatial queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3582--3585", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554849", abstract = "The increasing concerns on data security limit the sharing of data distributedly stored at multiple data owners and impede the scale of spatial queries over big urban data.
In response, data federation systems have emerged \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gassen:2022:DCS, author = "Marius Gassen and Benjamin H{\"a}ttasch and Benjamin Hilprecht and Nadja Geisler and Alexander Fraser and Carsten Binnig", title = "Demonstrating {CAT}: synthesizing data-aware conversational agents for transactional databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3586--3589", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554850", abstract = "Databases for OLTP are often the backbone for applications such as hotel room or cinema ticket booking applications. However, developing a conversational agent (i.e., a chatbot-like interface) to allow end-users to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Personnaz:2022:EGE, author = "Aur{\'e}lien Personnaz and Brit Youngmann and Sihem Amer-Yahia", title = "{EDA4SUM}: guided exploration of data summaries", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3590--3593", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554851", abstract = "We demonstrate EDA4Sum, a framework dedicated to generating guided multi-step data summarization pipelines for very large datasets. 
Data summarization is the process of producing interpretable and representative \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:CEQ, author = "Chenjie Li and Juseung Lee and Zhengjie Miao and Boris Glavic and Sudeepa Roy", title = "{CaJaDE}: explaining query results by augmenting provenance with context", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3594--3597", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554852", abstract = "In this work, we demonstrate CaJaDE (Context-Aware Join-Augmented Deep Explanations), a system that explains query results by augmenting provenance with contextual information from other related tables in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Asada:2022:STT, author = "Yuki Asada and Victor Fu and Apurva Gandhi and Advitya Gemawat and Lihao Zhang and Dong He and Vivek Gupta and Ehi Nosakhare and Dalitso Banda and Rathijit Sen and Matteo Interlandi", title = "Share the tensor tea: how databases can leverage the machine learning ecosystem", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3598--3601", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554853", abstract = "We demonstrate Tensor Query Processor (TQP): a query processor that automatically compiles relational operators into tensor programs. By leveraging tensor runtimes such as PyTorch, TQP is able to: (1) integrate with ML \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tan:2022:MTV, author = "Jess Tan and Desmond Yeo and Rachael Neoh and Huey-Eng Chua and Sourav S. Bhowmick", title = "{MOCHA}: a tool for visualizing impact of operator choices in query execution plans for database education", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3602--3605", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554854", abstract = "The database systems course is offered in many major universities.
A key learning goal of learners taking such a course is to understand how SQL queries are processed in an RDBMS in practice. To this end, comprehension of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2022:LVK, author = "Tsz Nam Chan and Pak Lon Ip and Kaiyan Zhao and Leong Hou U and Byron Choi and Jianliang Xu", title = "{LIBKDV}: a versatile kernel density visualization library for geospatial analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3606--3609", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554855", abstract = "Kernel density visualization (KDV) has been widely used in many geospatial analysis tasks, including traffic accident hotspot detection, crime hotspot detection, and disease outbreak detection. Although KDV can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ajmani:2022:DMR, author = "Arul Ajmani and Aayush Shah and Alexander Shraer and Adam Storm and Rebecca Taft and Oliver Tan and Nathan VanBenschoten", title = "A demonstration of multi-region {CockroachDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3610--3613", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554856", abstract = "A database service is required to meet the consistency, performance, and availability goals of modern applications serving a global user-base. Configuring a database deployed across multiple regions such that it fulfills these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chapman:2022:DAD, author = "Adriane Chapman and Luca Lauro and Paolo Missier and Riccardo Torlone", title = "{DPDS}: assisting data science with data provenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3614--3617", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554857", abstract = "Successful data-driven science requires a complex combination of data engineering pipelines and data modelling techniques. Robust and defensible results can only be achieved when each step in the pipeline \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dadvar:2022:PPO, author = "Vargha Dadvar and Lukasz Golab and Divesh Srivastava", title = "{POEM}: pattern-oriented explanations of {CNN} models", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3618--3621", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554858", abstract = "Deep learning models achieve state-of-the-art performance in many applications, but their prediction decisions are difficult to explain. Various solutions exist in the area of explainable AI, for example to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zalipynis:2022:WGA, author = "Ramon Antonio Rodriges Zalipynis and Nikita Terlych", title = "{WebArrayDB}: a geospatial array {DBMS} in your web browser", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3622--3625", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554859", abstract = "Geospatial array DBMSs operate on georeferenced $N$-d arrays. They provide storage engines, query parsers, and processing capabilities as their core functionality. Traditionally, those have been too heavy for a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lan:2022:ATA, author = "Hai Lan and Yuanjia Zhang and Zhifeng Bao and Yu Dong and Dongxu Huang and Liu Tang and Jian Zhang", title = "{AutoDI}: towards an automatic plan regression analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3626--3629", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554860", abstract = "Manual analysis on plan regression is both labor-intensive and inefficient for a large query plan and numerous queries. In this paper, we demonstrate AutoDI, an automatic detection and inference tool that has been \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Davidson:2022:PEA, author = "Susan B. Davidson and Shay Gershtein and Tova Milo and Slava Novgorodov and May Shoshan", title = "{PHOcus}: efficiently archiving photos", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3630--3633", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554861", abstract = "Our ability to collect data is rapidly outstripping our ability to effectively store and use it. Organizations are therefore facing tough decisions of what data to archive (or dispose of) to effectively meet their business goals. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2022:VTE,
  author =       "Kai Huang and Qingqing Ye and Jing Zhao and Xi Zhao
                 and Haibo Hu and Xiaofang Zhou",
  title =        "{VINCENT}: towards efficient exploratory subgraph
                 search in graph databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3634--3637",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554862",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554862",
  abstract =     "Exploratory search is a search paradigm that plays a
                 vital role in databases, data mining, and information
                 retrieval to assist users to get familiar with the
                 underlying databases. It supports iterative query
                 formulation to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Drien:2022:AAP,
  author =       "Osnat Drien and Matanya Freiman and Yael Amsterdamer",
  title =        "{ActivePDB}: active probabilistic databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3638--3641",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554863",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554863",
  abstract =     "We present a novel framework for uncertain data
                 management, called ActivePDB. We are given a relational
                 probabilistic database, where each tuple is correct
                 with some probability; e.g., a database constructed
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Teofili:2022:CED,
  author =       "Tommaso Teofili and Donatella Firmani and Nick Koudas
                 and Paolo Merialdo and Divesh Srivastava",
  title =        "{CERTEM}: explaining and debugging black-box entity
                 resolution systems with {CERTA}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3642--3645",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554864",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554864",
  abstract =     "Entity resolution (ER) aims at identifying record
                 pairs that refer to the same real-world entity. Recent
                 works have focused on deep learning (DL) techniques, to
                 solve this problem. While such works have brought
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Aksoy:2022:SIS,
  author =       "Ahmet Kerem Aksoy and Pavel Dushev and Eleni Tzirita
                 Zacharatou and Holmer Hemsen and Marcela Charfuelan and
                 Jorge-Arnulfo Quian{\'e}-Ruiz and Beg{\"u}m Demir and
                 Volker Markl",
  title =        "Satellite image search in {AgoraEO}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3646--3649",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554865",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554865",
  abstract =     "The growing operational capability of global Earth
                 Observation (EO) creates new opportunities for
                 data-driven approaches to understand and protect our
                 planet. 
However, the current use of EO archives is very restricted
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% Author initials carry a trailing period, as in the other entries
%%% of this file.
@Article{Yan:2022:SDD,
  author =       "Li Yan and Nerissa Xu and Guozhong Li and Sourav S.
                 Bhowmick and Byron Choi and Jianliang Xu",
  title =        "{SENSOR}: data-driven construction of sketch-based
                 visual query interfaces for time series data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3650--3653",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554866",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554866",
  abstract =     "Sketching is a common approach to visually query time
                 series data. However, a recent study reported that
                 sketching a pattern for querying is ``often ineffective
                 on its own'' in practice due to lack of
                 ``representative objects'' to facilitate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bonifati:2022:DPG,
  author =       "Angela Bonifati and Stefania Dumbrava and Emile
                 Martinez and Fatemeh Ghasemi and Malo Jaffr{\'e} and
                 Pac{\^o}me Luton and Thomas Pickles",
  title =        "{DiscoPG}: property graph schema discovery and
                 exploration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3654--3657",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554867",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554867",
  abstract =     "Property graphs are becoming pervasive in a variety of
                 graph processing applications using interconnected
                 data. They allow to encode multi-labeled nodes and
                 edges, as well as their properties, represented as
                 key/value \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Maamar-Kouadri:2022:SQO,
  author =       "Wissam Maamar-Kouadri and Salima Benbernou and Mourad
                 Ouziri and Themis Palpanas and Iheb {Ben Amor}",
  title =        "{SA-Q}: observing, evaluating, and enhancing the
                 quality of the results of sentiment analysis tools",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3658--3661",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554868",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554868",
  abstract =     "Sentiment analysis has received constant research
                 attention due to its usefulness and importance in
                 different applications. 
However, despite the research advances in this field, most
                 current tools suffer in prediction \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Orogat:2022:SDA,
  author =       "Abdelghny Orogat and Ahmed El-Roby",
  title =        "{SmartBench}: demonstrating automatic generation of
                 comprehensive benchmarks for question answering over
                 knowledge graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3662--3665",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554869",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554869",
  abstract =     "In recent years, a significant number of question
                 answering (QA) systems that retrieve answers to natural
                 language questions from knowledge graphs (KG) have been
                 introduced. However, finding a benchmark that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Tu:2022:DHE,
  author =       "Jianhong Tu and Xiaoyue Han and Ju Fan and Nan Tang
                 and Chengliang Chai and Guoliang Li and Xiaoyong Du",
  title =        "{DADER}: hands-off entity resolution with domain
                 adaptation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3666--3669",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554870",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554870",
  abstract =     "Entity resolution (ER) is a core data integration
                 problem that identifies pairs of data instances
                 referring to the same real-world entities, and the
                 state-of-the-art results of ER are achieved by deep
                 learning (DL) based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The product name in the title is brace-protected so that styles
%%% which downcase titles preserve its capitalization.
@Article{Gale:2022:SWS,
  author =       "James Gale and Max Seiden and Deepanshu Utkarsh and
                 Jason Frantz and Rob Woollen and {\c{C}}a{\u{g}}atay
                 Demiralp",
  title =        "{Sigma Workbook}: a spreadsheet for cloud data
                 warehouses",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3670--3673",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554871",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554871",
  abstract =     "Cloud data warehouses (CDWs) bring large-scale data
                 and compute power closer to users in enterprises.
                 However, existing tools for analyzing data in CDWs are
                 either limited in ad-hoc transformations or difficult
                 to use for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2022:RMC,
  author =       "Zihao Chen and Zhizhen Xu and Baokun Han and Chen Xu
                 and Weining Qian and Aoying Zhou",
  title =        "{ReMac}: a matrix computation system with redundancy
                 elimination",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3674--3677",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554872",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554872",
  abstract =     "Distributed matrix computation solutions support query
                 interfaces of linear algebra expressions, which often
                 contain redundancy, i.e., common and loop-constant
                 subexpressions. However, existing solutions fail
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wenig:2022:TBT,
  author =       "Phillip Wenig and Sebastian Schmidl and Thorsten
                 Papenbrock",
  title =        "{TimeEval}: a benchmarking toolkit for time series
                 anomaly detection algorithms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3678--3681",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554873",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554873",
  abstract =     "Detecting anomalous subsequences in time series is an
                 important task in time series analytics because it
                 serves the identification of special events, such as
                 production faults, delivery bottlenecks, system
                 defects, or heart \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The tool name in the title is brace-protected as a whole so that
%%% styles which downcase titles preserve its capitalization.
@Article{Lerner:2022:DAH,
  author =       "Alberto Lerner and Matthias Jasny and Theo Jepsen and
                 Carsten Binnig and Philippe Cudr{\'e}-Mauroux",
  title =        "{DBMS Annihilator}: a high-performance database
                 workload generator in action",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3682--3685",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554874",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554874",
  abstract =     "Modern DBMS engines can achieve unprecedented
                 transaction processing speeds thanks to the invention
                 of clever data structures, concurrency schemes, and
                 improvements in CPU and memory subsystems. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liang:2022:FSF,
  author =       "Zhiyu Liang and Hongzhi Wang",
  title =        "{FedTSC}: a secure federated learning system for
                 interpretable time series classification",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3686--3689",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554875",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554875",
  abstract =     "We demonstrate FedTSC, a novel federated learning (FL)
                 system for interpretable time series classification
                 (TSC). FedTSC is an FL-based TSC solution that makes a
                 great balance among security, interpretability,
                 accuracy, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wu:2022:AVA,
  author =       "Qingshun Wu and Yafei Li and Huiling Li and Di Zhang
                 and Guanglei Zhu",
  title =        "{AMRAS}: a visual analysis system for spatial
                 crowdsourcing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3690--3693",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554876",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554876",
  abstract =     "The wide adoption of GPS-enabled smart devices has
                 greatly promoted spatial crowdsourcing, where the core
                 issue is how to assign tasks to workers efficiently and
                 with high quality. In this paper, we build a novel
                 visual \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Al-Sayeh:2022:SCA,
  author =       "Hani Al-Sayeh and Muhammad Attahir Jibril and Muhammad
                 Waleed {Bin Saeed} and Kai-Uwe Sattler",
  title =        "{SparkCAD}: caching anomalies detector for {Spark}
                 applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3694--3697",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554877",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554877",
  abstract =     "Developers of Apache Spark applications can accelerate
                 their workloads by caching suitable intermediate
                 results in memory and reusing them rather than
                 recomputing them all over again every time they are
                 needed. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The lowercase particle "van" is parsed by BibTeX as the von part
%%% of the name, so the first author sorts under Leeuwen.  The
%%% citation key is left unchanged for existing citations.
@Article{vLeeuwen:2022:AQP,
  author =       "Wilco van Leeuwen and Thomas Mulder and Bram van de
                 Wall and George Fletcher and Nikolay Yakovets",
  title =        "{AvantGraph} query processing engine",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3698--3701",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554878",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554878",
  abstract =     "We demonstrate AvantGraph, a graph query processing
                 engine developed by the Database group at TU Eindhoven.
                 Designed for efficient processing of both subgraph
                 matching and navigational graph queries, AvantGraph
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Boniol:2022:TNL,
  author =       "Paul Boniol and John Paparrizos and Yuhao Kang and
                 Themis Palpanas and Ruey S. Tsay and Aaron J. Elmore
                 and Michael J. Franklin",
  title =        "{Theseus}: navigating the labyrinth of time-series
                 anomaly detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3702--3705",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554879",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554879",
  abstract =     "The detection of anomalies in time series has gained
                 ample academic and industrial attention, yet, no
                 comprehensive benchmark exists to evaluate time-series
                 anomaly detection methods. Therefore, there is no final
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Hofmann:2022:DAS,
  author =       "Dennis Hofmann and Peter VanNostrand and Huayi Zhang
                 and Yizhou Yan and Lei Cao and Samuel Madden and Elke
                 Rundensteiner",
  title =        "A demonstration of {AutoOD}: a self-tuning anomaly
                 detection system",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3706--3709",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554880",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554880",
  abstract =     "Anomaly detection is a critical task in applications
                 like preventing financial fraud, system malfunctions,
                 and cybersecurity attacks. While previous research has
                 offered a plethora of anomaly detection algorithms,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gakhar:2022:POA,
  author =       "Sunny Gakhar and Joyce Cahoon and Wangchao Le and
                 Xiangnan Li and Kaushik Ravichandran and Hiren Patel
                 and Marc Friedman and Brandon Haynes and Shi Qiao and
                 Alekh Jindal and Jyoti Leeka",
  title =        "{Pipemizer}: an optimizer for analytics data
                 pipelines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3710--3713",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554881",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554881",
  abstract =     "We demonstrate Pipemizer, an optimizer and recommender
                 aimed at improving the performance of queries or jobs
                 in pipelines. 
These job pipelines are ubiquitous in modern data analytics
                 due to jobs reading output files written \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Redyuk:2022:DAA,
  author =       "Sergey Redyuk and Zoi Kaoudi and Sebastian Schelter
                 and Volker Markl",
  title =        "{DORIAN} in action: assisted design of data science
                 pipelines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3714--3717",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554882",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554882",
  abstract =     "Existing automated machine learning solutions and
                 intelligent discovery assistants are popular tools that
                 facilitate the end-user with the design of data science
                 (DS) pipelines. However, they yield limited \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{He:2022:WDN,
  author =       "Yuntian He and Yue Zhang and Saket Gurukar and
                 Srinivasan Parthasarathy",
  title =        "{WebMILE}: democratizing network representation
                 learning at scale",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3718--3721",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554883",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554883",
  abstract =     "In recent years, we have seen the success of network
                 representation learning (NRL) methods in diverse
                 domains ranging from computational chemistry to drug
                 discovery and from social network analysis to
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The framework name in the title is brace-protected so that styles
%%% which downcase titles preserve its capitalization.
%%% NOTE(review): confirm the exact casing of the name against the
%%% published paper.
@Article{Geisler:2022:DQQ,
  author =       "Nadja Geisler and Benjamin H{\"a}ttasch and Carsten
                 Binnig",
  title =        "Demonstrating {Quest}: a query-driven framework to
                 explain classification models on tabular data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3722--3725",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554884",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554884",
  abstract =     "Machine learning models are everywhere now; but only
                 few of them are transparent in how they work. To remedy
                 this, local explanations aim to show users how and why
                 learned models produce a certain output for a given
                 input \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ripberger:2022:IID,
  author =       "Drew Ripberger and Yifan Gan and Xueyuan Ren and
                 Spyros Blanas and Yang Wang",
  title =        "{IsoBugView}: interactively debugging isolation bugs
                 in database applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3726--3729",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554885",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554885",
  abstract =     "Database applications frequently use weaker isolation
                 levels, such as Read Committed, for better performance,
                 which may lead to bugs that do not happen under
                 Serializable. Although a number of works have
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Foufoulas:2022:YRU,
  author =       "Yannis Foufoulas and Alkis Simitsis and Yannis
                 Ioannidis",
  title =        "{YeSQL}: rich user-defined functions without the
                 overhead",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3730--3733",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554886",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554886",
  abstract =     "The diversity and complexity of modern data management
                 applications led to the extension of the relational
                 paradigm with syntactic and semantic support for
                 User-Defined Functions (UDFs). Although
                 well-established in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2022:DAM,
  author =       "Zhihui Yang and Yicong Huang and Zuozhi Wang and Feng
                 Gao and Yao Lu and Chen Li and X. Sean Wang",
  title =        "Demonstration of accelerating machine learning
                 inference queries with correlative proxy models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3734--3737",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554887",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554887",
  abstract =     "We will demonstrate a prototype query-processing
                 engine, which utilizes correlations among predicates to
                 accelerate machine learning (ML) inference queries on
                 unstructured data. Expensive operators such as feature
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2022:DCI,
  author =       "Xiaozhen Liu and Zuozhi Wang and Shengquan Ni and
                 Sadeem Alsudais and Yicong Huang and Avinash Kumar and
                 Chen Li",
  title =        "Demonstration of collaborative and interactive
                 workflow-based data analytics in {Texera}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3738--3741",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554888",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554888",
  abstract =     "Collaborative data analytics is becoming increasingly
                 important due to the higher complexity of data science,
                 more diverse skills from different disciplines, more
                 common asynchronous schedules of team members, and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% In the abstract below, $N$-d denotes N-dimensional arrays; the
%%% dimension count N is a math symbol and is set in math mode.
@Article{Zalipynis:2022:SAR,
  author =       "Ramon Antonio Rodriges Zalipynis",
  title =        "{SimDB} in action: road traffic simulations completely
                 inside array {DBMS}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3742--3745",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554889",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554889",
  abstract =     "Array DBMSs operate on big $N$-d arrays. Cellular
                 automata (CA) work on a discrete lattice of cells,
                 essentially on $N$-d arrays. CA facilitate decision
                 support as they realistically simulate complex
                 phenomena including road \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Badaro:2022:TTD,
  author =       "Gilbert Badaro and Paolo Papotti",
  title =        "Transformers for tabular data representation: a
                 tutorial on models and applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3746--3749",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554890",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554890",
  abstract =     "In the last few years, the natural language processing
                 community witnessed advances in neural representations
                 of free texts with transformer-based language models
                 (LMs). Given the importance of knowledge available in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kiehn:2022:PDM,
  author =       "Felix Kiehn and Mareike Schmidt and Daniel Glake and
                 Fabian Panse and Wolfram Wingerath and Benjamin Wollmer
                 and Martin Poppinga and Norbert Ritter",
  title =        "Polyglot data management: state of the art \& open
                 challenges",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3750--3753",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554891",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554891",
  abstract =     "Due to the increasing variety of the current database
                 landscape, polyglot data management has become a hot
                 research topic in recent years. The underlying idea is
                 to combine the benefits of different data stores
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wasay:2022:MPT,
  author =       "Abdul Wasay and Nesime Tatbul and Justin Gottschlich",
  title =        "Machine programming: turning data into programmer
                 productivity",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3754--3757",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554892",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554892",
  abstract =     "Machine programming is an emerging research area that
                 improves the software development life cycle from
                 design through deployment. We present a tutorial on
                 machine programming research highlighting aspects
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2022:CDN,
  author =       "Guoliang Li and Haowen Dong and Chao Zhang",
  title =        "Cloud databases: new techniques, challenges, and
                 opportunities",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3758--3761",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554893",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554893",
  abstract =     "As database vendors are increasingly moving towards
                 the cloud data service, i.e., databases as a service
                 (DBaaS), cloud databases have become prevalent.
                 Compared with the early cloud-hosted databases, the new
                 generation \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Mhedhbi:2022:MTQ,
  author =       "Amine Mhedhbi and Semih Salihoglu",
  title =        "Modern techniques for querying graph-structured
                 relations: foundations, system implementations, and
                 open challenges",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3762--3765",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554894",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554894",
  abstract =     "The last decade has seen an emergence of numerous
                 specialized graph DBMSs (GDBMSs) as well as
                 graph-optimized extensions of RDBMSs. In addition,
                 several query processing techniques, such as worst-case
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Fang:2022:DSD,
  author =       "Yixiang Fang and Wensheng Luo and Chenhao Ma",
  title =        "Densest subgraph discovery on large graphs:
                 applications, challenges, and techniques",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "15",
  number =       "12",
  pages =        "3766--3769",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3554821.3554895",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Nov 17 11:11:07 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3554821.3554895",
  abstract =     "As one of the most fundamental problems in graph data
                 mining, the densest subgraph discovery (DSD) problem
                 has found a broad spectrum of real applications, such
                 as social network community detection, graph index
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trummer:2022:BGC, author = "Immanuel Trummer", title = "From {BERT} to {GPT-3 Codex}: harnessing the potential of very large language models for data management", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3770--3773", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554896", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554896", abstract = "Large language models have recently advanced the state of the art on many natural language processing benchmarks. The newest generation of models can be applied to a variety of tasks with little to no specialized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2022:PPF, author = "Kaisong Huang and Yuliang He and Tianzheng Wang", title = "The past, present and future of indexing on persistent memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3774--3777", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554897", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554897", abstract = "Persistent memory (PM) based indexing techniques have been proposed to build fast yet persistent indexes that sit on the memory bus. Over the past decade, numerous techniques have been proposed with various \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kaoudi:2022:UDA, author = "Zoi Kaoudi and Jorge-Arnulfo Quian{\'e}-Ruiz", title = "Unified data analytics: state-of-the-art and open problems", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3778--3781", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554898", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554898", abstract = "There is an urgent need for unifying data analytics as more and more application tasks become more complex: Nowadays, it is normal to see tasks performing data preparation, analytical processing, and machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2022:BGC, author = "Wenfei Fan", title = "Big graphs: challenges and opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3782--3797", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554899", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554899", abstract = "Big data is typically characterized with 4V's: Volume, Velocity, Variety and Veracity. When it comes to big graphs, these challenges become even more staggering. Each and every of the 4V's raises new questions, from \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Amer-Yahia:2022:TAP, author = "Sihem Amer-Yahia", title = "Towards {AI-powered} data-driven education", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3798--3806", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554900", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554900", abstract = "Educational platforms are increasingly becoming AI-driven. Besides providing a wide range of course filtering options, personalized recommendations of learning material and teachers are driving today's research. While \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2022:HIN, author = "Yizhou Sun and Jiawei Han and Xifeng Yan and Philip S. Yu and Tianyi Wu", title = "Heterogeneous information networks: the past, the present, and the future", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3807--3811", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554901", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554901", abstract = "In 2011, we proposed PathSim to systematically define and compute similarity between nodes in a heterogeneous information network (HIN), where nodes and links are from different types. In the PathSim paper, we for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Roy:2022:TIA, author = "Sudeepa Roy", title = "Toward interpretable and actionable data analysis with explanations and causality", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3812--3820", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554902", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554902", abstract = "We live in a world dominated by data, where users from different fields routinely collect, study, and make decisions supported by data. To aid these users, the current trend in data analysis is to design tools that allow \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ozcan:2022:RMD, author = "Fatma {\"O}zcan", title = "Reflections on my data management research journey ({VLDB} women in database research award talk)", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3821--3822", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554903", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554903", abstract = "Data-driven decision making is critical for all kinds of enterprises, public and private. It has been my mission to find more efficient, and effective ways to store, manage, query and analyze data to drive actionable \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohan:2022:PSF, author = "C. Mohan", title = "{Panel}: startups founded by database researchers", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3823--3825", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554904", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554904", abstract = "This in-person panel, which I will be moderating, will focus on startups founded by worldwide database researchers. The panelists are a set of people with different backgrounds in terms of their geographic locations, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Balazinska:2022:CDS, author = "Magdalena Balazinska and Surajit Chaudhuri and AnHai Doan and Joseph M. Hellerstein and Hanuma Kodavalla and Ippokratis Pandis and Matei Zaharia", title = "Cloud data systems: what are the opportunities for the database research community?", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "12", pages = "3826--3827", month = aug, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3554821.3554905", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:11:07 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3554821.3554905", abstract = "The panel will discuss the research opportunities for the database research community in the context of cloud native data services.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{John:2022:HDD, author = "Sachin Basil John and Christoph Koch", title = "High-Dimensional Data Cubes", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3828--3840", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565839", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565839", abstract = "This paper introduces an approach to supporting high-dimensional data cubes at interactive query speeds and moderate storage cost. The approach is based on binary(-domain) data cubes that are judiciously partially materialized; the missing information \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ceccarello:2022:FSM, author = "Matteo Ceccarello and Johann Gamper", title = "Fast and Scalable Mining of Time Series Motifs with Probabilistic Guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3841--3853", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565840", abstract = "Mining time series motifs is a fundamental, yet expensive task in exploratory data analytics. In this paper, we therefore propose a fast method to find the top-$k$ motifs with probabilistic guarantees. Our probabilistic approach is based on Locality \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deutch:2022:FEF, author = "Daniel Deutch and Amir Gilad and Tova Milo and Amit Mualem and Amit Somech", title = "{FEDEX}: an Explainability Framework for Data Exploration Steps", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3854--3868", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565841", abstract = "When exploring a new dataset, Data Scientists often apply analysis queries, look for insights in the resulting dataframe, and repeat to apply further queries. We propose in this paper a novel solution that assists data scientists in this laborious \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xekalaki:2022:ETA, author = "Maria Xekalaki and Juan Fumero and Athanasios Stratikopoulos and Katerina Doka and Christos Katsakioris and Constantinos Bitsakos and Nectarios Koziris and Christos Kotselidis", title = "Enabling Transparent Acceleration of Big Data Frameworks Using Heterogeneous Hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3869--3882", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565842", abstract = "The ever-increasing demand for high performance Big Data analytics and data processing, has paved the way for heterogeneous hardware accelerators, such as Graphics Processing 
Units (GPUs) and Field Programmable Gate Arrays (FPGAs), to be integrated into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fazzone:2022:DPN, author = "Adriano Fazzone and Tommaso Lanciano and Riccardo Denni and Charalampos E. Tsourakakis and Francesco Bonchi", title = "Discovering Polarization Niches via Dense Subgraphs with Attractors and Repulsers", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3883--3896", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565843", abstract = "Detecting niches of polarization in social media is a first step towards deploying mitigation strategies and avoiding radicalization. In this paper, we model polarization niches as close-knit dense communities of users, which are under the influence of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2022:SSU, author = "Eunjae Lee and Sam H. Noh and Jiwon Seo", title = "{Sage}: a System for Uncertain Network Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3897--3910", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565844", abstract = "We propose Sage, a system for uncertain network analysis. 
Algorithms for uncertain network analysis require large amounts of memory and computing resources as they sample a large number of network instances and run analysis on them. Sage makes uncertain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qin:2022:MBC, author = "Hongchao Qin and Rong-Hua Li and Ye Yuan and Guoren Wang and Lu Qin and Zhiwei Zhang", title = "Mining Bursting Core in Large Temporal Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3911--3923", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565845", abstract = "Temporal graphs are ubiquitous. Mining communities that are bursting in a period of time is essential for seeking real emergency events in temporal graphs. Unfortunately, most previous studies on community mining in temporal networks ignore the bursting \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2022:CBL, author = "Xiang Yu and Chengliang Chai and Guoliang Li and Jiabin Liu", title = "Cost-Based or Learning-Based?: a Hybrid Query Optimizer for Query Plan Selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3924--3936", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565846", abstract = "Traditional cost-based optimizers are efficient and stable to generate optimal plans for simple SQL queries, but they may not generate high-quality plans for complicated queries. Thus learning-based optimizers have been proposed recently that can learn \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Meng:2022:OIA, author = "Jingfan Meng and Huayi Wang and Jun Xu and Mitsunori Ogihara", title = "{ONe Index for All Kernels (ONIAK)}: a Zero Re-Indexing {LSH} Solution to {ANNS-ALT (After Linear Transformation)}", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3937--3949", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565847", abstract = "In this work, we formulate and solve a new type of approximate nearest neighbor search (ANNS) problems called ANNS after linear transformation (ALT). In ANNS-ALT, we search for the vector (in a dataset) that, after being linearly transformed by a user-specified
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shi:2022:LIB, author = "Jiachen Shi and Gao Cong and Xiao-Li Li", title = "Learned Index Benefits: Machine Learning Based Index Performance Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3950--3962", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565848", abstract = "Index selection remains one of the most challenging problems in relational database management systems. To find an optimum index configuration for a workload, accurately and efficiently quantifying the benefits of each candidate index configuration is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:ORM, author = "Jiachuan Wang and Peng Cheng and Libin Zheng and Lei Chen and Wenjie Zhang", title = "Online Ridesharing with Meeting Points", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3963--3975", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565849", abstract = "Nowadays, ridesharing becomes a popular commuting mode. Dynamically arriving riders post their origins and destinations, then the platform assigns drivers to serve them. 
In ridesharing, different groups of riders can be served by one driver if their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bellomarini:2022:EPE, author = "Luigi Bellomarini and Davide Benedetto and Matteo Brandetti and Emanuel Sallinger", title = "Exploiting the Power of Equality-Generating Dependencies in Ontological Reasoning", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3976--3988", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565850", abstract = "Equality-generating dependencies (EGDs) allow to fully exploit the power of existential quantification in ontological reasoning settings modeled via Tuple-Generating Dependencies (TGDs), by enabling value-assignment or forcing the equivalence of fresh \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Aamand:2022:NRF, author = "Anders Aamand and Debarati Das and Evangelos Kipouridis and Jakob B. T. Knudsen and Peter M. R. 
Rasmussen and Mikkel Thorup", title = "No Repetition: Fast and Reliable Sampling with Highly Concentrated Hashing", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "3989--4001", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565851", abstract = "Stochastic sample-based estimators are among the most fundamental and universally applied tools in statistics. Such estimators are particularly important when processing huge amounts of data, where we need to be able to answer a wide range of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Attouche:2022:WGJ, author = "Lyes Attouche and Mohamed-Amine Baazizi and Dario Colazzo and Giorgio Ghelli and Carlo Sartiani and Stefanie Scherzinger", title = "Witness Generation for {JSON} Schema", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4002--4014", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565852", abstract = "JSON Schema is a schema language for JSON documents, based on a complex combination of structural operators, Boolean operators (negation included), and recursive variables. The static analysis of JSON Schema documents comprises practically relevant \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shankar:2022:TOP, author = "Shreya Shankar and Aditya G. Parameswaran", title = "Towards Observability for Production Machine Learning Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4015--4022", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565853", abstract = "Software organizations are increasingly incorporating machine learning (ML) into their product offerings, driving a need for new data management tools. Many of these tools facilitate the initial development of ML applications, but sustaining these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2022:DES, author = "Sekwon Lee and Soujanya Ponnapalli and Sharad Singhal and Marcos K. Aguilera and Kimberly Keeton and Vijay Chidambaram", title = "{DINOMO}: an Elastic, Scalable, High-Performance Key-Value Store for Disaggregated Persistent Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4023--4037", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565854", abstract = "We present Dinomo, a novel key-value store for disaggregated persistent memory (DPM). Dinomo is the first key-value store for DPM that simultaneously achieves high common-case performance, scalability, and lightweight online reconfiguration. 
We observe \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shankar:2022:BCR, author = "Shreya Shankar and Stephen Macke and Sarah Chasins and Andrew Head and Aditya Parameswaran", title = "Bolt-on, Compact, and Rapid Program Slicing for Notebooks", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4038--4047", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565855", abstract = "Computational notebooks are commonly used for iterative workflows, such as in exploratory data analysis. This process lends itself to the accumulation of old code and hidden state, making it hard for users to reason about the lineage of, e.g., plots \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2022:FMT, author = "Weijie Sun and Zihuan Xu and Lei Chen", title = "Fairness Matters: a Tit-for-Tat Strategy Against Selfish Mining", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4048--4061", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565856", abstract = "The proof-of-work (PoW) based blockchains are more secure nowadays since profit-oriented miners contribute more computing powers in exchange for fair revenues. 
This virtuous circle only works under an incentive-compatible consensus, which is found to be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2022:SIO, author = "Jialin Ding and Ryan Marcus and Andreas Kipf and Vikram Nathan and Aniruddha Nrusimha and Kapil Vaidya and Alexander van Renen and Tim Kraska", title = "{SageDB}: an Instance-Optimized Data Analytics System", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4062--4078", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565857", abstract = "Modern data systems are typically both complex and general-purpose. They are complex because of the numerous internal knobs and parameters that users need to manually tune in order to achieve good performance; they are general-purpose because they are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Richly:2022:BCF, author = "Keven Richly and Rainer Schlosser and Martin Boissier", title = "Budget-Conscious Fine-Grained Configuration Optimization for Spatio-Temporal Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4079--4092", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565858", abstract = "Based on the performance requirements of modern spatio-temporal data mining applications, in-memory database systems are often used to store and process the data. To efficiently utilize the scarce DRAM capacities, modern database systems support various \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hsieh:2022:NGC, author = "Cheng-Yu Hsieh and Jieyu Zhang and Alexander Ratner", title = "{Nemo}: Guiding and Contextualizing Weak Supervision for Interactive Data Programming", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "13", pages = "4093--4105", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565838.3565859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue Dec 5 08:24:02 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565838.3565859", abstract = "Weak Supervision (WS) techniques allow users to efficiently create large training datasets by programmatically labeling data with heuristic sources of supervision. 
While the success of WS relies heavily on the provided labeling heuristics, the process \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Helt:2022:CCC, author = "Jeffrey Helt and Abhinav Sharma and Daniel J. Abadi and Wyatt Lloyd and Jose M. Faleiro", title = "{C5}: cloned concurrency control that always keeps up", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "1--14", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561262", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561262", abstract = "Asynchronously replicated primary-backup databases are commonly deployed to improve availability and offload read-only transactions. To both apply replicated writes from the primary and serve read-only transactions, the backups implement a cloned \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:CDS, author = "Ruihong Wang and Jianguo Wang and Stratos Idreos and M. Tamer {\"O}zsu and Walid G. Aref", title = "The case for distributed shared-memory databases with {RDMA}-enabled memory disaggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "15--22", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561263", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561263", abstract = "Memory disaggregation (MD) allows for scalable and elastic data center design by separating compute (CPU) from memory. 
With MD, compute and memory are no longer coupled into the same server box. Instead, they are connected to each other via ultra-fast \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2022:FED, author = "Chenyuan Wu and Mohammad Javad Amiri and Jared Asch and Heena Nagda and Qizhen Zhang and Boon Thau Loo", title = "{FlexChain}: an elastic disaggregated blockchain", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "23--36", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561264", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561264", abstract = "While permissioned blockchains enable a family of data center applications, existing systems suffer from imbalanced loads across compute and memory, exacerbating the underutilization of cloud resources. This paper presents FlexChain, a novel \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:MNL, author = "Zhen Zhang and Shuai Zheng and Yida Wang and Justin Chiu and George Karypis and Trishul Chilimbi and Mu Li and Xin Jin", title = "{MiCS}: near-linear scaling for training gigantic model on public cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "37--50", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561265", abstract = "Existing general purpose frameworks for gigantic model training, i.e., dense models with billions of parameters, cannot scale efficiently on cloud environment with various networking conditions due to large communication overheads. In this paper, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2022:PPC, author = "Yi Yang and Yurong Cheng and Ye Yuan and Guoren Wang and Lei Chen and Yongjiao Sun", title = "Privacy-preserving cooperative online matching over spatial crowdsourcing platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "51--63", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561266", abstract = "With the continuous development of spatial crowdsourcing platform, online task assignment problem has been widely studied as a typical problem in spatial crowdsourcing. 
Most of the existing studies are based on a single-platform task assignment to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:CMT, author = "Jiayi Wang and Chengliang Chai and Nan Tang and Jiabin Liu and Guoliang Li", title = "Coresets over multiple tables for feature-rich and data-efficient machine learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "64--76", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561267", abstract = "Successful machine learning (ML) needs to learn from good data. However, one common issue about train data for ML practitioners is the lack of good features. To mitigate this problem, feature augmentation is often employed by joining with (or enriching \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:SMM, author = "Zihao Zhang and Huiqi Hu and Xuan Zhou and Jiang Wang", title = "{Starry}: multi-master transaction processing on semi-leader architecture", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "77--89", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561268", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561268", abstract = "Multi-master architecture is desirable for cloud databases in supporting large-scale transaction processing. 
To enable concurrent transaction execution on multiple computing nodes, we need an efficient transaction commit protocol on the storage layer \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Skitsas:2022:SSE, author = "Konstantinos Skitsas and Ioannis G. Papageorgiou and Mohammad Sadegh Talebi and Verena Kantere and Michael N. Katehakis and Panagiotis Karras", title = "{SIFTER}: space-efficient value iteration for finite-horizon {MDPs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "90--98", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561269", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561269", abstract = "Can we solve finite-horizon Markov decision processes (FHMDPs) while raising low memory requirements? Such models find application in many cases where a decision-making agent needs to act in a probabilistic environment, from resource management to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2022:TUP, author = "Geoffrey X. 
Yu and Markos Markakis and Andreas Kipf and Per-{\AA}ke Larson and Umar Farooq Minhas and Tim Kraska", title = "{TreeLine}: an update-in-place key-value store for modern storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "99--112", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561270", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561270", abstract = "Many modern key-value stores, such as RocksDB, rely on log-structured merge trees (LSMs). Originally designed for spinning disks, LSMs optimize for write performance by only making sequential writes. But this optimization comes at the cost of reads: \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tao:2022:DPE, author = "Yuchao Tao and Amir Gilad and Ashwin Machanavajjhala and Sudeepa Roy", title = "{DPXPlain}: privately explaining aggregate query answers", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "1", pages = "113--126", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3561261.3561271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Nov 17 11:06:34 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3561261.3561271", abstract = "Differential privacy (DP) is the state-of-the-art and rigorous notion of privacy for answering aggregate database queries while preserving the privacy of sensitive information in the data. In today's era of data analysis, however, it poses new \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2022:EMP, author = "Lijun Chang and Mouyi Xu and Darren Strash", title = "Efficient maximum $k$-plex computation over large sparse graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "127--139", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565817", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565817", abstract = "The $k$-plex model is a relaxation of the clique model by allowing every vertex to miss up to $k$ neighbors. Designing exact and efficient algorithms for computing a maximum $k$-plex in a graph has been receiving increasing interest recently. However, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2022:OSE, author = "Tianxun Hu and Tianzheng Wang and Qingqing Zhou", title = "Online schema evolution is (almost) free for snapshot databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "140--153", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565818", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565818", abstract = "Modern database applications often change their schemas to keep up with the changing requirements. However, support for online and transactional schema evolution remains challenging in existing database systems. Specifically, prior work often takes ad \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:LEH, author = "Yifan Wang and Haodi Ma and Daisy Zhe Wang", title = "{LIDER}: an efficient high-dimensional learned index for large-scale dense passage retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "154--166", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565819", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565819", abstract = "Passage retrieval has been studied for decades, and many recent approaches of passage retrieval are using dense embeddings generated from deep neural models, called ``dense passage retrieval''. The state-of-the-art end-to-end dense passage retrieval \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shaham:2022:MMS, author = "Sina Shaham and Gabriel Ghinita and Cyrus Shahabi", title = "Models and mechanisms for spatial data fairness", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "167--179", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565820", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565820", abstract = "Fairness in data-driven decision-making studies scenarios where individuals from certain population segments may be unfairly treated when being considered for loan or job applications, access to public resources, or other types of services. In location- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2022:IMR, author = "Shixun Huang and Wenqing Lin and Zhifeng Bao and Jiachen Sun", title = "Influence maximization in real-world closed social networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "180--192", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565821", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565821", abstract = "In the last few years, many closed social networks such as WhatsAPP and WeChat have emerged to cater for people's growing demand of privacy and independence. In a closed social network, the posted content is not available to all users or senders can set \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bonifati:2022:TLI, author = "Angela Bonifati and Francesco {Del Buono} and Francesco Guerra and Donato Tiano", title = "{Time2Feat}: learning interpretable representations for multivariate time series clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "193--201", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565822", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565822", abstract = "Clustering multivariate time series is a critical task in many real-world applications involving multiple signals and sensors. 
Existing systems aim to maximize effectiveness, efficiency and scalability, but fail to guarantee the interpretability of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2022:OVF, author = "Xiaochen Li and Yuke Hu and Weiran Liu and Hanwen Feng and Li Peng and Yuan Hong and Kui Ren and Zhan Qin", title = "{OpBoost}: a vertical federated tree boosting framework based on order-preserving desensitization", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "202--215", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565823", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565823", abstract = "Vertical Federated Learning (FL) is a new paradigm that enables users with non-overlapping attributes of the same data samples to jointly train a model without directly sharing the raw data. Nevertheless, recent works show that it's still not sufficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Perera:2022:HSD, author = "R. Malinga Perera and Bastian Oetomo and Benjamin I. P. 
Rubinstein and Renata Borovica-Gajic", title = "{HMAB}: self-driving hierarchy of bandits for integrated physical database design tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "216--229", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565824", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565824", abstract = "Effective physical database design tuning requires selection of several physical design structures (PDS), such as indices and materialised views, whose combination influences overall system performance in a non-linear manner. While the simplicity of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Palyvos-Giannas:2022:EEO, author = "Dimitris Palyvos-Giannas and Katerina Tzompanaki and Marina Papatriantafilou and Vincenzo Gulisano", title = "{Erebus}: explaining the outputs of data streaming queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "230--242", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565825", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565825", abstract = "In data streaming, why-provenance can explain why a given outcome is observed but offers no help in understanding why an expected outcome is missing. Explaining missing answers has been addressed in DBMSs, but these solutions are not directly applicable \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2022:PPL, author = "Zhou Zhang and Zhaole Chu and Peiquan Jin and Yongping Luo and Xike Xie and Shouhong Wan and Yun Luo and Xufei Wu and Peng Zou and Chunyang Zheng and Guoan Wu and Andy Rudoff", title = "{PLIN}: a persistent learned index for non-volatile memory with high performance and instant recovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "243--255", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565826", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565826", abstract = "Non-Volatile Memory (NVM) has emerged as an alternative to next-generation main memories. Although many tree indices have been proposed for NVM, they generally use B+-tree-like structures. To further improve the performance of NVM-aware indices, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:FFC, author = "Zuozhi Wang and Shengquan Ni and Avinash Kumar and Chen Li", title = "{Fries}: fast and consistent runtime reconfiguration in dataflow systems with transactional guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "256--268", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565827", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565827", abstract = "A computing job in a big data system can take a long time to run, especially for pipelined executions on data streams. 
Developers often need to change the computing logic of the job such as fixing a loophole in an operator or changing the machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2022:FAD, author = "Renjie Xiao and Zijing Tan and Haojin Wang and Shuai Ma", title = "Fast approximate denial constraint discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "269--281", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565828", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565828", abstract = "We investigate the problem of discovering approximate denial constraints (DCs), for finding DCs that hold with some exceptions to avoid overfitting real-life dirty data and facilitate data cleaning tasks. Different methods have been proposed to address \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:FDD, author = "Haoyu Wang and Shaoxu Song", title = "Frequency domain data encoding in {Apache IoTDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "282--290", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565829", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565829", abstract = "Frequency domain analysis is widely conducted on time series. 
While online transforming from time domain to frequency domain is costly, e.g., by Fast Fourier Transform (FFT), it is highly demanded to store the frequency domain data for reuse. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2022:HMS, author = "Jiping Zheng and Yuan Ma and Wei Ma and Yanhao Wang and Xiaoyang Wang", title = "Happiness maximizing sets under group fairness constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "291--303", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565830", abstract = "Finding a happiness maximizing set (HMS) from a database, i.e., selecting a small subset of tuples that preserves the best score with respect to any nonnegative linear utility function, is an important problem in multi-criteria decision-making. When an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Renggli:2022:SEF, author = "Cedric Renggli and Xiaozhe Yao and Luka Kolar and Luka Rimanic and Ana Klimovic and Ce Zhang", title = "{SHiFT}: an efficient, flexible search engine for transfer learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "304--316", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565831", abstract = "Transfer learning can be seen as a data- and compute-efficient alternative to training models from scratch. The emergence of rich model repositories, such as TensorFlow Hub, enables practitioners and researchers to unleash the potential of these models \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Islam:2022:SCT, author = "Md. Mouinul Islam and Dong Wei and Baruch Schieber and Senjuti Basu Roy", title = "Satisfying complex top-$k$ fairness constraints by preference substitutions", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "317--329", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565832", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565832", abstract = "Given $m$ users (voters), where each user casts her preference for a single item (candidate) over $n$ items (candidates) as a ballot, the preference aggregation problem returns $k$ items (candidates) that have the $k$ highest number of preferences (votes).
Our \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karpov:2022:SSE, author = "Nikolai Karpov and Qin Zhang", title = "{SyncSignature}: a simple, efficient, parallelizable framework for tree similarity joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "330--342", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565833", abstract = "This paper introduces SyncSignature, the first fully parallelizable algorithmic framework for tree similarity joins under edit distance. SyncSignature makes use of implicit-synchronized signature generation schemes, which allow for an efficient and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2022:APG, author = "Shuang Yang and Yahui Sun and Jiesong Liu and Xiaokui Xiao and Rong-Hua Li and Zhewei Wei", title = "Approximating probabilistic group {Steiner} trees in graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "343--355", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565834", abstract = "Consider an edge-weighted graph, and a number of properties of interests (PoIs). Each vertex has a probability of exhibiting each PoI. 
The joint probability that a set of vertices exhibits a PoI is the probability that this set contains at least one \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Papadias:2022:SER, author = "Serafeim Papadias and Zoi Kaoudi and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker Markl", title = "Space-efficient random walks on streaming graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "356--368", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565835", abstract = "Graphs in many applications, such as social networks and IoT, are inherently streaming, involving continuous additions and deletions of vertices and edges at high rates. Constructing random walks in a graph, i.e., sequences of vertices selected with a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:PPT, author = "Pengfei Wang and Xiaocan Zeng and Lu Chen and Fan Ye and Yuren Mao and Junhao Zhu and Yunjun Gao", title = "{PromptEM}: prompt-tuning for low-resource generalized entity matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "369--378", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565836", abstract = "Entity Matching (EM), which aims to identify whether two entity records from two relational tables refer to the same real-world entity, is one of the fundamental problems in data management. Traditional EM assumes that two tables are homogeneous with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2022:CAC, author = "Zhihan Guo and Xinyu Zeng and Kan Wu and Wuh-Chwen Hwang and Ziwei Ren and Xiangyao Yu and Mahesh Balakrishnan and Philip A. Bernstein", title = "{Cornus}: atomic commit for a cloud {DBMS} with storage disaggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "2", pages = "379--392", month = oct, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3565816.3565837", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Nov 25 08:53:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3565816.3565837", abstract = "Two-phase commit (2PC) is widely used in distributed databases to ensure atomicity of distributed transactions. 
Conventional 2PC was originally designed for the shared-nothing architecture and has two limitations: long latency due to two eager log \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2022:RTT, author = "Haitao Yuan and Guoliang Li and Zhifeng Bao", title = "Route Travel Time Estimation on a Road Network Revisited: Heterogeneity, Proximity, Periodicity and Dynamicity", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "393--405", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570691", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570691", abstract = "In this paper, we revisit the problem of route travel time estimation on a road network and aim to boost its accuracy by capturing and utilizing spatio-temporal features from four significant aspects: heterogeneity, proximity, periodicity and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2022:SOM, author = "Yongji Wu and Matthew Lentz and Danyang Zhuo and Yao Lu", title = "Serving and Optimizing Machine Learning Workflows on Heterogeneous Infrastructures", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "406--419", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570692", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570692", abstract = "With the advent of ubiquitous deployment of smart devices and the Internet of Things, data sources for machine learning inference have increasingly moved to the edge of the network. Existing machine learning inference platforms typically assume a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Geng:2022:CRB, author = "Zixuan Geng and Maximilian Schleich and Dan Suciu", title = "Computing Rule-Based Explanations by Leveraging Counterfactuals", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "420--432", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570693", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570693", abstract = "Sophisticated machine models are increasingly used for high-stakes decisions in everyday life. There is an urgent need to develop effective explanation techniques for such automated decisions. Rule-Based Explanations have been proposed for high-stake \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Peng:2022:SSI, author = "Jinfeng Peng and Derong Shen and Nan Tang and Tieying Liu and Yue Kou and Tiezheng Nie and Hang Cui and Ge Yu", title = "Self-Supervised and Interpretable Data Cleaning with Sequence Generative Adversarial Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "433--446", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570694", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570694", abstract = "We study the problem of self-supervised and interpretable data cleaning, which automatically extracts interpretable data repair rules from dirty data. In this paper, we propose a novel framework, namely Garf, based on sequence generative adversarial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Romero:2022:OVA, author = "Francisco Romero and Johann Hauswald and Aditi Partap and Daniel Kang and Matei Zaharia and Christos Kozyrakis", title = "Optimizing Video Analytics with Declarative Model Relationships", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "447--460", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570695", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570695", abstract = "The availability of vast video collections and the accuracy of ML models has generated significant interest in video analytics systems. 
Since naively processing all frames using expensive models is impractical, researchers have proposed optimizations \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2022:SRT, author = "Jiaxin Jiang and Yuan Li and Bingsheng He and Bryan Hooi and Jia Chen and Johan Kok Zhi Kang", title = "{Spade}: a Real-Time Fraud Detection Framework on Evolving Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "461--469", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570696", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570696", abstract = "Real-time fraud detection is a challenge for most financial and electronic commercial platforms. To identify fraudulent communities, Grab, one of the largest technology companies in Southeast Asia, forms a graph from a set of transactions and detects \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2022:GET, author = "Xupeng Miao and Yujie Wang and Youhe Jiang and Chunan Shi and Xiaonan Nie and Hailin Zhang and Bin Cui", title = "{Galvatron}: Efficient Transformer Training over Multiple {GPUs} Using Automatic Parallelism", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "470--479", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570697", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570697", abstract = "Transformer models have achieved state-of-the-art performance on various domains of applications and gradually becomes the foundations of the advanced large deep learning (DL) models. However, how to train these models over multiple GPUs efficiently is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2022:IDL, author = "Qitong Wang and Stephen Whitmarsh and Vincent Navarro and Themis Palpanas", title = "{iEDeaL}: a Deep Learning Framework for Detecting Highly Imbalanced Interictal Epileptiform Discharges", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "480--490", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570698", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570698", abstract = "Epilepsy is a chronic neurological disease, ranked as the second most burdensome neurological disorder worldwide. 
Detecting Interictal Epileptiform Discharges (IEDs) is among the most important clinician operations to support epilepsy diagnosis, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zapridou:2022:DLP, author = "Eleni Zapridou and Ioannis Mytilinis and Anastasia Ailamaki", title = "{Dalton}: Learned Partitioning for Distributed Data Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "491--504", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570699", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570699", abstract = "To sustain the input rate of high-throughput streams, modern stream processing systems rely on parallel execution. However, skewed data yield imbalanced load assignments and create stragglers that hinder scalability. Deciding on a static partitioning for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Behrouz:2022:FCS, author = "Ali Behrouz and Farnoosh Hashemi and Laks V. S. Lakshmanan", title = "{FirmTruss} Community Search in Multilayer Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "505--518", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570700", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570700", abstract = "In applications such as biological, social, and transportation networks, interactions between objects span multiple aspects.
For accurately modeling such applications, multilayer networks have been proposed. Community search allows for personalized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2022:ETC, author = "Tianyang Xu and Zhao Lu and Yuanyuan Zhu", title = "Efficient Triangle-Connected Truss Community Search in Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "519--531", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570701", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570701", abstract = "Community search studies the retrieval of certain community structures containing query vertices, which has received lots of attention recently. $k$-truss is a fundamental community structure where each edge is contained in at least $ k - 2 $ triangles. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sabek:2022:CLM, author = "Ibrahim Sabek and Kapil Vaidya and Dominik Horn and Andreas Kipf and Michael Mitzenmacher and Tim Kraska", title = "Can Learned Models Replace Hash Functions?", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "532--545", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570702", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570702", abstract = "Hashing is a fundamental operation in database management, playing a key role in the implementation of numerous core database data structures and algorithms. Traditional hash functions aim to mimic a function that maps a key to a random value, which can \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2022:TGA, author = "Yue Zhao and George H. Chen and Zhihao Jia", title = "{TOD}: {GPU}-Accelerated Outlier Detection via Tensor Operations", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "546--560", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570703", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570703", abstract = "Outlier detection (OD) is a key machine learning task for finding rare and deviant data samples, with many time-critical applications such as fraud detection and intrusion detection. 
In this work, we propose TOD, the first tensor-based system for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2022:FFL, author = "Chaohong Ma and Xiaohui Yu and Yifan Li and Xiaofeng Meng and Aishan Maoliniyazi", title = "{FILM}: a Fully Learned Index for Larger-Than-Memory Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "3", pages = "561--573", month = nov, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3570690.3570704", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:37 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3570690.3570704", abstract = "As modern applications generate data at an unprecedented speed and often require the querying/analysis of data spanning a large duration, it is crucial to develop indexing techniques that cater to larger-than-memory databases, where data reside on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mazmudar:2022:CMI, author = "Miti Mazmudar and Thomas Humphries and Jiaxiang Liu and Matthew Rafuse and Xi He", title = "Cache Me If You Can: Accuracy-Aware Inference Engine for Differentially Private Data Exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "574--586", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574246", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574246", abstract = "Differential privacy (DP) allows data analysts to query databases that contain users' sensitive information while providing a quantifiable privacy guarantee to users. Recent interactive DP systems such as APEx provide accuracy guarantees over the query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Falzon:2022:RSE, author = "Francesca Falzon and Evangelia Anna Markatou and Zachary Espiritu and Roberto Tamassia", title = "Range Search over Encrypted Multi-Attribute Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "587--600", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574247", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574247", abstract = "This work addresses expressive queries over encrypted data by presenting the first systematic study of multi-attribute range search on a symmetrically encrypted database outsourced to an honest-but-curious server. Prior work includes a thorough analysis \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ren:2022:HMA, author = "Xuanle Ren and Le Su and Zhen Gu and Sheng Wang and Feifei Li and Yuan Xie and Song Bian and Chao Li and Fan Zhang", title = "{HEDA}: Multi-Attribute Unbounded Aggregation over Homomorphically Encrypted Database", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "601--614", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574248", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574248", abstract = "Recent years have witnessed the rapid development of the encrypted database, due to the increasing number of data privacy breaches and the corresponding laws and regulations that caused millions of dollars in loss. These encrypted databases may rely on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shen:2022:DPG, author = "Chih-Ya Shen and Shao-Heng Ko and Guang-Siang Lee and Wang-Chien Lee and De-Nian Yang", title = "Density Personalized Group Query", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "615--628", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574249", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574249", abstract = "Research on new queries for finding dense subgraphs and groups has been actively pursued due to their many applications, especially in social network analysis and graph mining. 
However, existing work faces two major weaknesses: (i) incapability of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Geng:2022:NDH, author = "Jinkun Geng and Anirudh Sivaraman and Balaji Prabhakar and Mendel Rosenblum", title = "{Nezha}: Deployable and High-Performance Consensus Using Synchronized Clocks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "629--642", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574250", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574250", abstract = "This paper presents a high-performance consensus protocol, Nezha, which can be deployed by cloud tenants without support from cloud providers. Nezha bridges the gap between protocols such as Multi-Paxos and Raft, which can be readily deployed, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmad:2022:PPR, author = "Ishtiyaque Ahmad and Divyakant Agrawal and Amr {El Abbadi} and Trinabh Gupta", title = "{Pantheon}: Private Retrieval from Public Key--Value Store", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "643--656", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574251", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574251", abstract = "Consider a cloud server that owns a key-value store and provides a private query service to its clients. 
Preserving client privacy in this setting is difficult because the key-value store is public, and a client cannot encrypt or modify it. Therefore, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{DaDalt:2022:BSV, author = "Francesco {Da Dalt} and Simon Scherrer and Adrian Perrig", title = "{Bayesian} Sketches for Volume Estimation in Data Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "657--669", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574252", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574252", abstract = "Given large data streams of items, each attributable to a certain key and possessing a certain volume, the aggregate volume associated with a key is difficult to estimate in a way that is both efficient and accurate. On the one hand, exact counting with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Moti:2022:WWA, author = "Moin Hussain Moti and Panagiotis Simatis and Dimitris Papadias", title = "{Waffle}: a Workload-Aware and Query-Sensitive Framework for Disk-Based Spatial Indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "670--683", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574253", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574253", abstract = "Although several spatial indexes achieve fast query processing, they are ineffective for highly dynamic data sets because of costly updates. On the other hand, simple structures that enable efficient updates are slow for spatial queries. In this paper, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pena:2022:FAD, author = "Eduardo H. M. Pena and Fabio Porto and Felix Naumann", title = "Fast Algorithms for Denial Constraint Discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "684--696", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574254", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574254", abstract = "Denial constraints (DCs) are an integrity constraint formalism widely used to detect inconsistencies in data. Several algorithms have been devised to discover DCs from data, as manually specifying them is burdensome and, worse yet, error-prone. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiao:2022:TQI, author = "Pu Jiao and Sheng Di and Hanqi Guo and Kai Zhao and Jiannan Tian and Dingwen Tao and Xin Liang and Franck Cappello", title = "Toward Quantity-of-Interest Preserving Lossy Compression for Scientific Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "697--710", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574255", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574255", abstract = "Today's scientific simulations and instruments are producing a large amount of data, leading to difficulties in storing, transmitting, and analyzing these data. While error-controlled lossy compressors are effective in significantly reducing data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Demirci:2022:SGC, author = "Gunduz Vehbi Demirci and Aparajita Haldar and Hakan Ferhatosmanoglu", title = "Scalable Graph Convolutional Network Training on Distributed-Memory Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "711--724", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574256", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574256", abstract = "Graph Convolutional Networks (GCNs) are extensively utilized for deep learning on graphs. The large data sizes of graphs and their vertex features make scalable training algorithms and distributed memory systems necessary. 
Since the convolution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schafer:2022:MSA, author = "Patrick Sch{\"a}fer and Ulf Leser", title = "{Motiflets}: Simple and Accurate Detection of Motifs in Time Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "725--737", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574257", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574257", abstract = "A time series motif intuitively is a short time series that repeats itself approximately the same within a larger time series. Such motifs often represent concealed structures, such as heart beats in an ECG recording, the riff in a pop song, or sleep \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Narayan:2022:CFM, author = "Avanika Narayan and Ines Chami and Laurel Orr and Christopher R{\'e}", title = "Can Foundation Models Wrangle Your Data?", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "738--746", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574258", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574258", abstract = "Foundation Models (FMs) are models trained on large corpora of data that, at very large scale, can generalize to new tasks without any task-specific finetuning. 
As these models continue to grow in size, innovations continue to push the boundaries of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2022:MDB, author = "Bogyeong Kim and Kyoseung Koo and Undraa Enkhbat and Sohyun Kim and Juhun Kim and Bongki Moon", title = "{M2Bench}: a Database Benchmark for Multi-Model Analytic Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "747--759", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574259", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574259", abstract = "As the world becomes increasingly data-centric, the tasks dealt with by a database management system (DBMS) become more complex and diverse. Compared with traditional workloads that typically require only a single data model, modern-day computational \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Baruah:2022:POD, author = "Nirvik Baruah and Peter Kraft and Fiodar Kazhamiaka and Peter Bailis and Matei Zaharia", title = "Parallelism-Optimizing Data Placement for Faster Data-Parallel Computations", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "760--771", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574260", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574260", abstract = "Systems performing large data-parallel computations, including online analytical processing (OLAP) systems like Druid and search engines like Elasticsearch, are increasingly being used for business-critical real-time applications where providing low \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lazebnik:2022:SSB, author = "Teddy Lazebnik and Amit Somech and Abraham Itzhak Weinberg", title = "{SubStrat}: a Subset-Based Optimization Strategy for Faster {AutoML}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "772--780", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574261", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574261", abstract = "Automated machine learning (AutoML) frameworks have become important tools in the data scientist's arsenal, as they dramatically reduce the manual work devoted to the construction of ML pipelines. 
Such frameworks intelligently search among millions of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gurukar:2022:MWS, author = "Saket Gurukar and Nikil Pancha and Andrew Zhai and Eric Kim and Samson Hu and Srinivasan Parthasarathy and Charles Rosenberg and Jure Leskovec", title = "{MultiBiSage}: a {Web}-Scale Recommendation System Using Multiple Bipartite Graphs at {Pinterest}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "781--789", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574262", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574262", abstract = "Graph Convolutional Networks (GCN) can efficiently integrate graph structure and node features to learn high-quality node embeddings. At Pinterest, we have developed and deployed PinSage, a data-efficient GCN that learns pin embeddings from the Pin-Board \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeakis:2022:TEF, author = "Alexandros Zeakis and Dimitrios Skoutas and Dimitris Sacharidis and Odysseas Papapetrou and Manolis Koubarakis", title = "{TokenJoin}: Efficient Filtering for Set Similarity Join with Maximum Weighted Bipartite Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "790--802", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574263", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574263", abstract = "Set similarity join is an important problem with many applications in data discovery, cleaning and integration. To increase robustness, fuzzy set similarity join calculates the similarity of two sets based on maximum weighted bipartite matching instead \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kayali:2022:QSC, author = "Moe Kayali and Dan Suciu", title = "Quasi-Stable Coloring for Graph Compression: Approximating Max-Flow, Linear Programs, and Centrality", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "803--815", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574264", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574264", abstract = "We propose quasi-stable coloring, an approximate version of stable coloring. 
Stable coloring, also called color refinement, is a well-studied technique in graph theory for classifying vertices, which can be used to build compact, lossless \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pujol:2022:MAD, author = "David Pujol and Albert Sun and Brandon Fain and Ashwin Machanavajjhala", title = "Multi-Analyst Differential Privacy for Online Query Answering", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "816--828", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574265", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574265", abstract = "Most differentially private mechanisms are designed for the use of a single analyst. In reality, however, there are often multiple stakeholders with different and possibly conflicting priorities that must share the same privacy loss budget. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gubner:2022:EVM, author = "Tim Gubner and Peter Boncz", title = "{Excalibur}: a Virtual Machine for Adaptive Fine-grained {JIT}-Compiled Query Execution based on {VOILA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "829--841", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574266", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574266", abstract = "In recent years, hardware has become increasingly diverse, in terms of features as well as performance. This poses a problem for complex software in general and database systems in particular. To achieve top-notch performance, we need to exploit \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qin:2022:ADO, author = "Lianke Qin and Rajesh Jayaram and Elaine Shi and Zhao Song and Danyang Zhuo and Shumo Chu", title = "{Adore}: Differentially Oblivious Relational Database Operators", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "842--855", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574267", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574267", abstract = "There has been a recent effort in applying differential privacy on memory access patterns to enhance data privacy. This is called differential obliviousness. 
Differential obliviousness is a promising direction because it provides a principled trade-off \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Laddad:2022:KCC, author = "Shadaj Laddad and Conor Power and Mae Milano and Alvin Cheung and Natacha Crooks and Joseph M. Hellerstein", title = "Keep {CALM} and {CRDT} On", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "856--863", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574268", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574268", abstract = "Despite decades of research and practical experience, developers have few tools for programming reliable distributed applications without resorting to expensive coordination techniques. Conflict-free replicated datatypes (CRDTs) are a promising line of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2022:MLS, author = "Kejing Lu and Yoshiharu Ishikawa and Chuan Xiao", title = "{MQH}: Locality Sensitive Hashing on Multi-level Quantization Errors for Point-to-Hyperplane Distances", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "864--876", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574269", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574269", abstract = "Point-to-hyperplane nearest neighbor search (P2HNNS) is a fundamental problem which has many applications in data mining and machine learning. 
In this paper, we propose a provable Locality-Sensitive-Hashing (LSH) scheme based on multi-level quantization \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Szarnyas:2022:LSN, author = "G{\'a}bor Sz{\'a}rnyas and Jack Waudby and Benjamin A. Steer and D{\'a}vid Szak{\'a}llas and Altan Birler and Mingxi Wu and Yuchen Zhang and Peter Boncz", title = "The {LDBC} Social Network Benchmark: Business Intelligence Workload", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "877--890", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574270", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574270", abstract = "The Social Network Benchmark's Business Intelligence workload (SNB BI) is a comprehensive graph OLAP benchmark targeting analytical data systems capable of supporting graph workloads. This paper marks the finalization of almost a decade of research in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{An:2022:MCM, author = "Shuai An and Yang Cao", title = "Making Cache Monotonic and Consistent", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "891--904", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574271", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574271", abstract = "We propose monotonic consistent caching (MCC), a cache scheme for applications that demand consistency and monotonicity. 
MCC warrants that a transaction-like request always sees a consistent view of the backend database and observed writes over the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2022:SPE, author = "Ziyun Wei and Immanuel Trummer", title = "{SkinnerMT}: Parallelizing for Efficiency and Robustness in Adaptive Query Processing on Multicore Platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "905--917", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574272", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574272", abstract = "SkinnerMT is an adaptive query processing engine, specialized for multi-core platforms. SkinnerMT features different strategies for parallel processing that allow users to trade between average run time and performance robustness. First, SkinnerMT \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2022:EAQ, author = "Dujian Ding and Sihem Amer-Yahia and Laks Lakshmanan", title = "On Efficient Approximate Queries over Machine Learning Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "918--931", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574273", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574273", abstract = "The question of answering queries over ML predictions has been gaining attention in the database community. 
This question is challenging because finding high quality answers by invoking an oracle such as a human expert or an expensive deep neural \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khatiwada:2022:IDL, author = "Aamod Khatiwada and Roee Shraga and Wolfgang Gatterbauer and Ren{\'e}e J. Miller", title = "Integrating Data Lake Tables", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "4", pages = "932--945", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.14778/3574245.3574274", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:39 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3574245.3574274", abstract = "We have made tremendous strides in providing tools for data scientists to discover new tables useful for their analyses. But despite these advances, the proper integration of discovered tables has been under-explored. An interesting semantics for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2022:PTS, author = "Hongbo Kang and Yiwei Zhao and Guy E. Blelloch and Laxman Dhulipala and Yan Gu and Charles McGuffey and Phillip B. 
Gibbons",
  title =        "{PIM-Tree}: a Skew-Resistant Index for
                 Processing-in-Memory",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "4",
  pages =        "946--958",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3574245.3574275",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Mar 11 08:12:39 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3574245.3574275",
  abstract =     "The performance of today's in-memory indexes is
                 bottlenecked by the memory latency/bandwidth wall.
                 Processing-in-memory (PIM) is an emerging approach that
                 potentially mitigates this bottleneck, by enabling
                 low-latency memory access whose aggregate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2022:WRE,
  author =       "Zhijia Chen and Weiyi Meng and Eduard Dragut",
  title =        "{Web} Record Extraction with Invariants",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "4",
  pages =        "959--972",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3574245.3574276",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:29:39 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3574245.3574276",
  abstract =     "Web records are structured data on a Web page that
                 embeds records retrieved from an underlying database
                 according to some templates. Mining data records on the
                 Web enables the integration of data from multiple Web
                 sites for providing value-added \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2022:DGM,
  author =       "Yong Wang and Guoliang Li and Kaiyu Li and Haitao
                 Yuan",
  title =        "A Deep Generative Model for Trajectory Modeling and
                 Utilization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "4",
  pages =        "973--985",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3574245.3574277",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:29:39 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3574245.3574277",
  abstract =     "Modern location-based systems have stimulated
                 explosive growth of urban trajectory data and promoted
                 many real-world applications, e.g., trajectory
                 prediction. However, heavy big data processing overhead
                 and privacy concerns hinder trajectory \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Xu:2022:LTH,
  author =       "Zihuan Xu and Lei Chen",
  title =        "{L2chain}: Towards High-performance, Confidential and
                 Secure Layer-2 Blockchain Solution for Decentralized
                 Applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "4",
  pages =        "986--999",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3574245.3574278",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:29:39 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3574245.3574278",
  abstract =     "With the rapid development of blockchain, the concept
                 of decentralized applications (DApps), built upon smart
                 contracts, has attracted much attention in academia and
                 industry. However, significant issues w.r.t. system
                 throughput, transaction \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:ATR, author = "Mingxuan Li and Yazhe Wang and Shuai Ma and Chao Liu and Dongdong Huo and Yu Wang and Zhen Xu", title = "Auto-Tuning with Reinforcement Learning for Permissioned Blockchain Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1000--1012", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579076", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579076", abstract = "In a permissioned blockchain, performance dictates its development, which is substantially influenced by its parameters. However, research on auto-tuning for better performance has somewhat stagnated because of the difficulty posed by distributed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2023:PSH, author = "Minhui Xie and Youyou Lu and Qing Wang and Yangyang Feng and Jiaqiang Liu and Kai Ren and Jiwu Shu", title = "{PetPS}: Supporting Huge Embedding Models with Persistent Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1013--1022", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579077", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579077", abstract = "Embedding models are effective for learning high-dimensional sparse data. Traditionally, they are deployed in DRAM parameter servers (PS) for online inference access. 
However, the ever-increasing model capacity makes this practice suffer from both high \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rabbani:2023:EVS, author = "Kashif Rabbani and Matteo Lissandrini and Katja Hose", title = "Extraction of Validating Shapes from Very Large Knowledge Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1023--1032", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579078", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579078", abstract = "Knowledge Graphs (KGs) represent heterogeneous domain knowledge on the Web and within organizations. There exist shapes constraint languages to define validating shapes to ensure the quality of the data in KGs. Existing techniques to extract validating \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pang:2023:AFM, author = "Pu Pang and Gang Deng and Kaihao Bai and Quan Chen and Shixuan Sun and Bo Liu and Yu Xu and Hongbo Yao and Zhengheng Wang and Xiyu Wang and Zheng Liu and Zhuo Song and Yong Yang and Tao Ma and Minyi Guo", title = "{Async-Fork}: Mitigating Query Latency Spikes Incurred by the Fork-based Snapshot Mechanism from the {OS} Level", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1033--1045", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579079", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579079", abstract = "In-memory key-value stores (IMKVSes) serve many online applications. They generally adopt the fork-based snapshot mechanism to support data backup. However, this method can result in query latency spikes because the engine is out-of-service for queries \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:CPJ, author = "Qichen Wang and Xiao Hu and Binyang Dai and Ke Yi", title = "Change Propagation Without Joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1046--1058", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579080", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579080", abstract = "We revisit the classical change propagation framework for query evaluation under updates. 
The standard framework takes a query plan and materializes the intermediate views, which incurs high polynomial costs in both space and time, with the join \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2023:FFF, author = "Yuexiang Xie and Zhen Wang and Dawei Gao and Daoyuan Chen and Liuyi Yao and Weirui Kuang and Yaliang Li and Bolin Ding and Jingren Zhou", title = "{FederatedScope}: a Flexible Federated Learning Platform for Heterogeneity", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1059--1072", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579081", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579081", abstract = "Although remarkable progress has been made by existing federated learning (FL) platforms to provide infrastructures for development, these platforms may not well tackle the challenges brought by various types of heterogeneity. To fill this gap, in this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:AAC, author = "Boyang Li and Yurong Cheng and Ye Yuan and Yi Yang and QianQian Jin and Guoren Wang", title = "{ACTA}: Autonomy and Coordination Task Assignment in Spatial Crowdsourcing Platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1073--1085", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579082", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579082", abstract = "Spatial platforms have become increasingly important in people's daily lives. Task assignment is a critical problem in these platforms that matches real-time orders to suitable workers. Most studies only focus on independent platforms that are in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Um:2023:FAD, author = "Taegeon Um and Byungsoo Oh and Byeongchan Seo and Minhyeok Kweun and Goeun Kim and Woo-Yeon Lee", title = "{FastFlow}: Accelerating Deep Learning Model Training with Smart Offloading of Input Data Pipeline", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1086--1099", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579083", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579083", abstract = "When training a deep learning (DL) model, input data are pre-processed on CPUs and transformed into tensors, which are then fed into GPUs for gradient computations of model training. 
Expensive GPUs must be fully utilized during training to accelerate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:FFM, author = "Xi Zhao and Bolong Zheng and Xiaomeng Yi and Xiaofan Luan and Charles Xie and Xiaofang Zhou and Christian S. Jensen", title = "{FARGO}: Fast Maximum Inner Product Search via Global Multi-Probing", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1100--1112", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579084", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579084", abstract = "Maximum inner product search (MIPS) in high-dimensional spaces has wide applications but is computationally expensive due to the curse of dimensionality. Existing studies employ asymmetric transformations that reduce the MIPS problem to a nearest \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kiefer:2023:ODP, author = "Martin Kiefer and Ilias Poulakis and Eleni Tzirita Zacharatou and Volker Markl", title = "Optimistic Data Parallelism for {FPGA}-Accelerated Sketching", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1113--1125", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579085", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579085", abstract = "Sketches are a popular approximation technique for large datasets and high-velocity data streams. 
While custom FPGA-based hardware has shown admirable throughput at sketching, the state-of-the-art exploits data parallelism by fully replicating resources \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arcolezi:2023:RCM, author = "H{\'e}ber H. Arcolezi and S{\'e}bastien Gambs and Jean-Fran{\c{c}}ois Couchot and Catuscia Palamidessi", title = "On the Risks of Collecting Multidimensional Data Under Local Differential Privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1126--1139", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579086", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579086", abstract = "The private collection of multiple statistics from a population is a fundamental statistical problem. One possible approach to realize this is to rely on the local model of differential privacy (LDP). Numerous LDP protocols have been developed for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chatzakis:2023:OJL, author = "Manos Chatzakis and Panagiota Fatourou and Eleftherios Kosmas and Themis Palpanas and Botao Peng", title = "{Odyssey}: a Journey in the Land of Distributed Data Series Similarity Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1140--1153", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579087", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579087", abstract = "This paper presents Odyssey, a novel distributed data-series processing framework that efficiently addresses the critical challenges of exhibiting good speedup and ensuring high scalability in data series processing by taking advantage of the full \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2023:AER, author = "Lanting Fang and Kaiyu Feng and Jie Gui and Shanshan Feng and Aiqun Hu", title = "Anonymous Edge Representation for Inductive Anomaly Detection in Dynamic Bipartite Graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1154--1167", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579088", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579088", abstract = "The activities in many real-world applications, such as e-commerce and online education, are usually modeled as a dynamic bipartite graph that evolves over time. It is a critical task to detect anomalies inductively in a dynamic bipartite graph. 
\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2023:STR,
  author =       "Junyong Yang and Ming Zhong and Yuanyuan Zhu and
                 Tieyun Qian and Mengchi Liu and Jeffrey Xu Yu",
  title =        "Scalable Time-Range $k$-Core Query on Temporal
                 Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "5",
  pages =        "1168--1180",
  month =        jan,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3579075.3579089",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Mar 11 08:12:40 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3579075.3579089",
  abstract =     "Querying cohesive subgraphs on temporal graphs with
                 various time constraints has attracted intensive
                 research interests recently. In this paper, we study a
                 novel Temporal $k$-Core Query (TCQ) problem: given a
                 time interval, find all distinct $k$-cores that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhu:2023:HPR,
  author =       "Erkang Zhu and Silu Huang and Surajit Chaudhuri",
  title =        "High-Performance Row Pattern Recognition Using Joins",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "5",
  pages =        "1181--1195",
  month =        jan,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3579075.3579090",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Mar 11 08:12:40 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3579075.3579090",
  abstract =     "The SQL standard introduced MATCH\_RECOGNIZE in 2016
                 for row pattern recognition. Since then,
                 MATCH\_RECOGNIZE has been supported by several leading
                 relation systems, they implemented this function using
                 Non-Deterministic Finite Automaton (NFA).
While NFA \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2023:HGA, author = "Kelin Luo and Alexandre M. Florio and Syamantak Das and Xiangyu Guo", title = "A Hierarchical Grouping Algorithm for the Multi-Vehicle Dial-a-Ride Problem", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "5", pages = "1195--1207", month = jan, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3579075.3579091", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 11 08:12:40 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3579075.3579091", abstract = "Ride-sharing is an essential aspect of modern urban mobility. In this paper, we consider a classical problem in ride-sharing --- the Multi-Vehicle Dial-a-Ride Problem (Multi-Vehicle DaRP). Given a fleet of vehicles with a fixed capacity stationed at \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2023:LAD, author = "Xiaoxuan Liu and Shuxian Wang and Mengzhu Sun and Sicheng Pan and Ge Li and Siddharth Jha and Cong Yan and Junwen Yang and Shan Lu and Alvin Cheung", title = "Leveraging Application Data Constraints to Optimize Database-Backed {Web} Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "6", pages = "1208--1221", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3583140.3583141", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 1 07:43:11 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3583140.3583141", abstract = "Exploiting the relationships among data is a classical query optimization technique. 
As persistent data is increasingly being created and maintained programmatically, prior work that infers data relationships from data statistics misses an important \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gruber:2023:BCD, author = "Ferdinand Gruber and Maximilian Bandle and Alexis Engelke and Thomas Neumann and Jana Giceva", title = "Bringing Compiling Databases to {RISC} Architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "6", pages = "1222--1234", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3583140.3583142", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 1 07:43:11 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3583140.3583142", abstract = "Current hardware development greatly influences the design decisions of modern database systems. For many modern performance-focused database systems, query compilation emerged as an integral part and different approaches for code generation evolved, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cha:2023:BLH, author = "Hokeun Cha and Xiangpeng Hao and Tianzheng Wang and Huanchen Zhang and Aditya Akella and Xiangyao Yu", title = "{B$^{\rm link}$-hash}: an Adaptive Hybrid Index for In-Memory Time-Series Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "6", pages = "1235--1248", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3583140.3583143", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 1 07:43:11 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3583140.3583143", abstract = "High-speed data ingestion is critical in time-series workloads that are driven by the growth of Internet of Things (IoT) applications. We observe that traditional tree-based indexes encounter severe scalability bottlenecks for time-series workloads that insert monotonically increasing timestamp keys into an index; all insertions go to a small memory region that sees extremely high contention.\par In this work, we present a new index design, Blink-hash, that enhances a tree-based index with hash leaf nodes to mitigate the contention of monotonic insertions --- insertions go to random locations within a hash node (which is much larger than a B+-tree node) to reduce conflicts. We develop further optimizations (median approximation and lazy split) to accelerate hash node splits. We also develop structure adaptation optimizations to dynamically convert a hash node to B+-tree nodes for good scan performance. 
Our evaluation shows that Blink-hash achieves up to 91.3$ \times $ higher throughput than conventional indexes in a time-series workload that monotonically inserts timestamps into an index, while showing comparable scan performance to a well-optimized B+-tree.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2023:DSE, author = "Wentao Huang and Yunhong Ji and Xuan Zhou and Bingsheng He and Kian-Lee Tan", title = "A Design Space Exploration and Evaluation for Main-Memory Hash Joins in Storage Class Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "6", pages = "1249--1263", month = feb, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3583140.3583144", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 1 07:43:11 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3583140.3583144", abstract = "In this paper, we seek to perform a rigorous experimental study of main-memory hash joins in storage class memory (SCM). In particular, we perform a design space exploration in real SCM for two state-of-the-art join algorithms: partitioned hash join \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2023:EBB,
  author = "Kaile Huang and Si Liu and Zhenge Chen and Hengfeng Wei and David Basin and Haixiang Li and Anqun Pan",
  title = "Efficient Black-Box Checking of Snapshot Isolation in Databases",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1264--1276",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583145",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583145",
  abstract = "Snapshot isolation (SI) is a prevalent weak isolation level that avoids the performance penalty imposed by serializability and simultaneously prevents various undesired data anomalies. Nevertheless, SI anomalies have recently been found in production \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:DPV,
  author = "Zitao Li and Tianhao Wang and Ninghui Li",
  title = "Differentially Private Vertical Federated Clustering",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1277--1290",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583146",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583146",
  abstract = "In many applications, multiple parties have private data regarding the same set of users but on disjoint sets of attributes, and a server wants to leverage the data to train a model. To enable model learning while protecting the privacy of the data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhao:2023:PCT,
  author = "Fuheng Zhao and Punnal Ismail Khan and Divyakant Agrawal and Amr {El Abbadi} and Arpit Gupta and Zaoxing Liu",
  title = "{Panakos}: Chasing the Tails for Multidimensional Data Streams",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1291--1304",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583147",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583147",
  abstract = "System operators are often interested in extracting different feature streams from multi-dimensional data streams; and reporting their distributions at regular intervals, including the heavy hitters that contribute to the tail portion of the feature \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Furst:2023:VOM,
  author = "Jonathan F{\"u}rst and Mauricio Fadel Argerich and Bin Cheng",
  title = "{VersaMatch}: Ontology Matching with Weak Supervision",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1305--1318",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583148",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583148",
  abstract = "Ontology matching is crucial to data integration for across-silo data sharing and has been mainly addressed with heuristic and machine learning (ML) methods.
While heuristic methods are often inflexible and hard to extend to new domains, ML methods rely \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Sun:2023:RRT,
  author = "Yushi Sun and Hao Xin and Lei Chen",
  title = "{RECA}: Related Tables Enhanced Column Semantic Type Annotation Framework",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1319--1331",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583149",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583149",
  abstract = "Understanding the semantics of tabular data is of great importance in various downstream applications, such as schema matching, data cleaning, and data integration. Column semantic type annotation is a critical task in the semantic understanding of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:ZWT,
  author = "Yiming Li and Yanyan Shen and Lei Chen and Mingxuan Yuan",
  title = "{Zebra}: When Temporal Graph Neural Networks Meet Temporal Personalized {PageRank}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1332--1345",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583150",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583150",
  abstract = "Temporal graph neural networks (T-GNNs) are state-of-the-art methods for learning representations over dynamic graphs.
Despite the superior performance, T-GNNs still suffer from high computational complexity caused by the tedious recursive temporal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Feng:2023:EAC,
  author = "Su Feng and Boris Glavic and Oliver Kennedy",
  title = "Efficient Approximation of Certain and Possible Answers for Ranking and Window Queries over Uncertain Data",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1346--1358",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583151",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583151",
  abstract = "Uncertainty arises naturally in many application domains due to, e.g., data entry errors and ambiguity in data cleaning. Prior work in incomplete and probabilistic databases has investigated the semantics and efficient evaluation of ranking and top-k \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Yue:2023:GEV,
  author = "Cong Yue and Tien Tuan Anh Dinh and Zhongle Xie and Meihui Zhang and Gang Chen and Beng Chin Ooi and Xiaokui Xiao",
  title = "{GlassDB}: an Efficient Verifiable Ledger Database System Through Transparency",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1359--1371",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583152",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583152",
  abstract = "Verifiable ledger databases protect data history against malicious tampering. Existing systems, such as blockchains and certificate transparency, are based on transparency logs --- a simple abstraction allowing users to verify that a log maintained by \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:EDT,
  author = "Qian Zhang and Jingyao Li and Hongyao Zhao and Quanqing Xu and Wei Lu and Jinliang Xiao and Fusheng Han and Chuanhui Yang and Xiaoyong Du",
  title = "Efficient Distributed Transaction Processing in Heterogeneous Networks",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1372--1385",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583153",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583153",
  abstract = "Countrywide and worldwide business, like gaming and social networks, drives the popularity of inter-data-center transactions.
To support inter-data-center transaction processing and data center fault tolerance simultaneously, existing protocols suffer \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Jiang:2023:ASE,
  author = "Zhiguo Jiang and Hanhua Chen and Hai Jin",
  title = "{Auxo}: a Scalable and Efficient Graph Stream Summarization Structure",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1386--1398",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583154",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583154",
  abstract = "A graph stream refers to a continuous stream of edges, forming a huge and fast-evolving graph. The vast volume and high update speed of a graph stream bring stringent requirements for the data management structure, including sublinear space cost, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{He:2023:OOS,
  author = "Xiao He and Ye Li and Jian Tan and Bin Wu and Feifei Li",
  title = "{OneShotSTL}: One-Shot Seasonal-Trend Decomposition For Online Time Series Anomaly Detection And Forecasting",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1399--1412",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583155",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583155",
  abstract = "Seasonal-trend decomposition is one of the most fundamental concepts in time series analysis that supports various downstream tasks, including time series anomaly detection and forecasting. However, existing decomposition methods rely on batch \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{vanRenen:2023:CAB,
  author = "Alexander van Renen and Viktor Leis",
  title = "Cloud Analytics Benchmark",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1413--1425",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583156",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583156",
  abstract = "The cloud facilitates the transition to a service-oriented perspective. This affects cloud-native data management in general, and data analytics in particular. Instead of managing a multi-node database cluster on-premise, end users simply send queries \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Alhomssi:2023:SRS,
  author = "Adnan Alhomssi and Viktor Leis",
  title = "Scalable and Robust Snapshot Isolation for High-Performance Storage Engines",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1426--1438",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583157",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583157",
  abstract = "MVCC-based snapshot isolation promises that read queries can proceed without interfering with concurrent writes. However, as we show experimentally, in existing implementations a single long-running query can easily cause transactional throughput to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:FFS,
  author = "Xiang Li and Fabing Li and Mingyu Gao",
  title = "{Flare}: a Fast, Secure, and Memory-Efficient Distributed Analytics Framework",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1439--1452",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583158",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583158",
  abstract = "As big data processing in the cloud becomes prevalent today, data privacy on such public platforms raises critical concerns. Hardware-based trusted execution environments (TEEs) provide promising and practical platforms for low-cost privacy-preserving \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{An:2023:NSB,
  author = "Mijin An and Jonghyeok Park and Tianzheng Wang and Beomseok Nam and Sang-Won Lee",
  title = "{NV-SQL}: Boosting {OLTP} Performance with Non-Volatile {DIMMs}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1453--1465",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583159",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583159",
  abstract = "When running OLTP workloads, relational DBMSs with flash SSDs still suffer from the durability overhead. Heavy writes to SSD not only limit the performance but also shorten the storage lifespan. To mitigate the durability overhead, this paper proposes a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhu:2023:LLR,
  author = "Rong Zhu and Wei Chen and Bolin Ding and Xingguang Chen and Andreas Pfadler and Ziniu Wu and Jingren Zhou",
  title = "{Lero}: a Learning-to-Rank Query Optimizer",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1466--1479",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583160",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583160",
  abstract = "A recent line of works apply machine learning techniques to assist or rebuild cost-based query optimizers in DBMS.
While exhibiting superiority in some benchmarks, their deficiencies, e.g., unstable performance, high training cost, and slow model \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Lee:2023:DCS,
  author = "Kitaek Lee and Insoon Jo and Jaechan Ahn and Hyuk Lee and Hwang Lee and Woong Sul and Hyungsoo Jung",
  title = "Deploying Computational Storage for {HTAP DBMSs} Takes More Than Just Computation Offloading",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1480--1493",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583161",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583161",
  abstract = "Hybrid transactional/analytical processing (HTAP) would overload database systems. To alleviate performance interference between transactions and analytics, recent research pursues the potential of in-storage processing (ISP) using commodity \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Tang:2023:TPC,
  author = "Dixin Tang and Alan Fekete and Indranil Gupta and Aditya G.
Parameswaran",
  title = "Transactional Panorama: a Conceptual Framework for User Perception in Analytical Visual Interfaces",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1494--1506",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583162",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583162",
  abstract = "Many tools empower analysts and data scientists to consume analysis results in a visual interface. When the underlying data changes, these results need to be updated, but this update can take a long time---all while the user continues to explore the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Paulsen:2023:SSY,
  author = "Derek Paulsen and Yash Govind and AnHai Doan",
  title = "{Sparkly}: a Simple yet Surprisingly Strong {TF\slash IDF} Blocker for Entity Matching",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1507--1519",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583163",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583163",
  abstract = "Blocking is a major task in entity matching. Numerous blocking solutions have been developed, but as far as we can tell, blocking using the well-known tf/idf measure has received virtually no attention. Yet, when we experimented with tf/idf blocking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Negi:2023:RQD,
  author = "Parimarjan Negi and Ziniu Wu and Andreas Kipf and Nesime Tatbul and Ryan Marcus and Sam Madden and Tim Kraska and Mohammad Alizadeh",
  title = "Robust Query Driven Cardinality Estimation under Changing Workloads",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1520--1533",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583164",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583164",
  abstract = "Query driven cardinality estimation models learn from a historical log of queries. They are lightweight, having low storage requirements, fast inference and training, and are easily adaptable for any kind of query. Unfortunately, such models can suffer \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Fu:2023:CTR,
  author = "Han Fu and Chang Liu and Bin Wu and Feifei Li and Jian Tan and Jianling Sun",
  title = "{CatSQL}: Towards Real World Natural Language to {SQL} Applications",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1534--1547",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583165",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583165",
  abstract = "Natural language to SQL (NL2SQL) techniques provide a convenient interface to access databases, especially for non-expert users, to conduct various data analytics.
Existing methods often employ either a rule-base approach or a deep learning based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Azizi:2023:EGB,
  author = "Ilias Azizi and Karima Echihabi and Themis Palpanas",
  title = "{ELPIS}: Graph-Based Similarity Search for Scalable Data Science",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1548--1559",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583166",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583166",
  abstract = "The recent popularity of learned embeddings has fueled the growth of massive collections of high-dimensional (high-d) vectors that model complex data. Finding similar vectors in these collections is at the core of many important and practical data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Duffy:2023:DKV,
  author = "Carl Duffy and Jaehoon Shim and Sang-Hoon Kim and Jin-Soo Kim",
  title = "{Dotori}: a Key--Value {SSD} Based {KV} Store",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1560--1572",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583167",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583167",
  abstract = "Key-value SSDs (KVSSDs) represent a major shift in the storage stack design, with numerous potential benefits.
Despite this, their lack of native features critical to operation in real world scenarios hinders their adoption, and these benefits go \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Pujol:2023:PPG,
  author = "David Pujol and Amir Gilad and Ashwin Machanavajjhala",
  title = "{PreFair}: Privately Generating Justifiably Fair Synthetic Data",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1573--1586",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583168",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583168",
  abstract = "When a database is protected by Differential Privacy (DP), its usability is limited in scope. In this scenario, generating a synthetic version of the data that mimics the properties of the private data allows users to perform any operation on the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Shraga:2023:EDC,
  author = "Roee Shraga and Ren{\'e}e J. Miller",
  title = "Explaining Dataset Changes for Semantic Data Versioning with {Explain-Da-V}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "6",
  pages = "1587--1600",
  month = feb,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3583140.3583169",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Mon May 1 07:43:11 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3583140.3583169",
  abstract = "In multi-user environments in which data science and analysis is collaborative, multiple versions of the same datasets are generated.
While managing and storing data versions has received some attention in the research literature, the semantic nature of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Budiu:2023:DAI,
  author = "Mihai Budiu and Tej Chajed and Frank McSherry and Leonid Ryzhyk and Val Tannen",
  title = "{DBSP}: Automatic Incremental View Maintenance for Rich Query Languages",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1601--1614",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587137",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587137",
  abstract = "Incremental view maintenance (IVM) has long been a central problem in database theory. Many solutions have been proposed for restricted classes of database languages, such as the relational algebra, or Datalog. These techniques do not naturally \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Liang:2023:SSP,
  author = "Ling Liang and Jilan Lin and Zheng Qu and Ishtiyaque Ahmad and Fengbin Tu and Trinabh Gupta and Yufei Ding and Yuan Xie",
  title = "{SPG}: Structure-Private Graph Database via {SqueezePIR}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1615--1628",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587138",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587138",
  abstract = "Many relational data in our daily life are represented as graphs, making graph application an important workload. Because of the large scale of graph datasets, moving graph data to the cloud becomes a popular option. To keep the confidential and private \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:IES,
  author = "Jingyuan Zhang and Ao Wang and Xiaolong Ma and Benjamin Carver and Nicholas John Newman and Ali Anwar and Lukas Rupprecht and Vasily Tarasov and Dimitrios Skourtis and Feng Yan and Yue Cheng",
  title = "{InfiniStore}: Elastic Serverless Cloud Storage",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1629--1642",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587139",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587139",
  abstract = "Cloud object storage such as AWS S3 is cost-effective and highly elastic but relatively slow, while high-performance cloud storage such as AWS ElastiCache is expensive and provides limited elasticity. We present a new cloud storage service called \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Fang:2023:DGE,
  author = "Peng Fang and Arijit Khan and Siqiang Luo and Fang Wang and Dan Feng and Zhenli Li and Wei Yin and Yuchao Cao",
  title = "Distributed Graph Embedding with Information-Oriented Random Walks",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1643--1656",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587140",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587140",
  abstract = "Graph embedding maps graph nodes to low-dimensional vectors, and is widely adopted in machine learning tasks.
The increasing availability of billion-edge graphs underscores the importance of learning efficient and effective embeddings on large graphs, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zheng:2023:SSV,
  author = "Shuyuan Zheng and Yang Cao and Masatoshi Yoshikawa",
  title = "Secure {Shapley} Value for Cross-Silo Federated Learning",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1657--1670",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587141",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587141",
  abstract = "The Shapley value (SV) is a fair and principled metric for contribution evaluation in cross-silo federated learning (cross-silo FL), wherein organizations, i.e., clients, collaboratively train prediction models with the coordination of a parameter \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:SSF,
  author = "Xiang Li and Nuozhou Sun and Yunqian Luo and Mingyu Gao",
  title = "{SODA}: a Set of Fast Oblivious Algorithms in Distributed Secure Data Analytics",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1671--1684",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587142",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587142",
  abstract = "Cloud systems are now a prevalent platform to host large-scale big-data analytics applications such as machine learning and relational database.
However, data privacy remains as a critical concern for public cloud systems. Existing trusted hardware \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Hong:2023:GSB,
  author = "Zicong Hong and Song Guo and Enyuan Zhou and Wuhui Chen and Huawei Huang and Albert Zomaya",
  title = "{GriDB}: Scaling Blockchain Database via Sharding and Off-Chain Cross-Shard Mechanism",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1685--1698",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587143",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587143",
  abstract = "Blockchain databases have attracted widespread attention but suffer from poor scalability due to underlying non-scalable blockchains. While blockchain sharding is necessary for a scalable blockchain database, it poses a new challenge named on-chain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Jian:2023:SAS,
  author = "Xun Jian and Zhiyuan Li and Lei Chen",
  title = "{SUFF}: Accelerating Subgraph Matching with Historical Data",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "7",
  pages = "1699--1711",
  month = mar,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3587136.3587144",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Tue May 9 09:08:30 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3587136.3587144",
  abstract = "Subgraph matching is a fundamental problem in graph theory and has wide applications in areas like sociology, chemistry, and social networks.
Due to its NP-hardness, the basic approach is a brute-force search over the whole search space. Some pruning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2023:WDM, author = "Haochen He and Erci Xu and Shanshan Li and Zhouyang Jia and Si Zheng and Yue Yu and Jun Ma and Xiangke Liao", title = "When Database Meets New Storage Devices: Understanding and Exposing Performance Mismatches via Configurations", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1712--1725", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587145", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587145", abstract = "NVMe SSD hugely boosts the I/O speed, with up to GB/s throughput and microsecond-level latency. Unfortunately, DBMS users can often find their high-performanced storage devices tend to deliver less-than-expected or even worse performance when compared \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2023:SAD, author = "Grace Fan and Jin Wang and Yuliang Li and Dan Zhang and Ren{\'e}e J. 
Miller", title = "Semantics-Aware Dataset Discovery from Data Lakes with Contextualized Column-Based Representation Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1726--1739", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587146", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587146", abstract = "Dataset discovery from data lakes is essential in many real application scenarios. In this paper, we propose Starmie, an end-to-end framework for dataset discovery from data lakes (with table union search as the main use case). Our proposed framework \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mortensen:2023:MEM, author = "Kasper Overgaard Mortensen and Fatemeh Zardbani and Mohammad Ahsanul Haque and Steinn Ymir Agustsson and Davide Mottin and Philip Hofmann and Panagiotis Karras", title = "{Marigold}: Efficient $k$-Means Clustering in High Dimensions", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1740--1748", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587147", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587147", abstract = "How can we efficiently and scalably cluster high-dimensional data? The $k$-means algorithm clusters data by iteratively reducing intra-cluster Euclidean distances until convergence. While it finds applications from recommendation engines to image \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sabek:2023:CLM, author = "Ibrahim Sabek and Tim Kraska", title = "The Case for Learned In-Memory Joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1749--1762", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587148", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587148", abstract = "In-memory join is an essential operator in any database engine. It has been extensively investigated in the database literature. In this paper, we study whether exploiting the CDF-based learned models to boost the join performance is practical. To the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:EEB, author = "Ruiyuan Li and Zheng Li and Yi Wu and Chao Chen and Yu Zheng", title = "{Elf}: Erasing-Based Lossless Floating-Point Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1763--1776", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587149", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587149", abstract = "There are a prohibitively large number of floating-point time series data generated at an unprecedentedly high rate. An efficient, compact and lossless compression for time series data is of great importance for a wide range of scenarios. 
Most existing lossless floating-point compression methods are based on the XOR operation, but they do not fully exploit the trailing zeros, which usually results in an unsatisfactory compression ratio. This paper proposes an Erasing-based Lossless Floating-point compression algorithm, i.e., \pkg{Elf}. The main idea of \pkg{Elf} is to erase the last few bits (i.e., set them to zero) of floating-point values, so the XORed values are supposed to contain many trailing zeros. The challenges of the erasing-based method are three-fold. First, how to quickly determine the erased bits? Second, how to losslessly recover the original data from the erased ones? Third, how to compactly encode the erased data? Through rigorous mathematical analysis, \pkg{Elf} can directly determine the erased bits and restore the original values without losing any precision. To further improve the compression ratio, we propose a novel encoding strategy for the XORed values with many trailing zeros. \pkg{Elf} works in a streaming fashion. It takes only $ O(N) $ (where $N$ is the length of a time series) in time and $ O(1)$ in space, and achieves a notable compression ratio with a theoretical guarantee. Extensive experiments using 22 datasets show the powerful performance of \pkg{Elf} compared with 9 advanced competitors.", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:LLO, author = "Tianyi Chen and Jun Gao and Hedui Chen and Yaofeng Tu", title = "{LOGER}: a Learned Optimizer Towards Generating Efficient and Robust Query Execution Plans", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1777--1789", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587150", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587150", abstract = "Query optimization based on deep reinforcement learning (DRL) has become a hot research topic recently. Despite the achieved promising progress, DRL optimizers still face great challenges of robustly producing efficient plans, due to the vast search \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Martens:2023:RPG, author = "Wim Martens and Matthias Niewerth and Tina Popp and Carlos Rojas and Stijn Vansummeren and Domagoj Vrgo{\v{c}}", title = "Representing Paths in Graph Database Pattern Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "7", pages = "1790--1803", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3587136.3587151", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Tue May 9 09:08:30 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3587136.3587151", abstract = "Modern graph database query languages such as GQL, SQL/PGQ, and their academic predecessor G-Core promote paths to first-class citizens in the sense that their pattern matching facility can return paths, as opposed to only nodes and edges. 
This is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:ZVE, author = "Xiling Li and Chenkai Weng and Yongxin Xu and Xiao Wang and Jennie Rogers", title = "{ZKSQL}: Verifiable and Efficient Query Evaluation with Zero-Knowledge Proofs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1804--1816", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594513", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594513", abstract = "Individuals and organizations are using databases to store personal information at an unprecedented rate. This creates a quandary for data providers. They are responsible for protecting the privacy of individuals described in their database. On the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Piao:2023:CGE, author = "Chengzhi Piao and Tingyang Xu and Xiangguo Sun and Yu Rong and Kangfei Zhao and Hong Cheng", title = "Computing Graph Edit Distance via Neural Graph Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1817--1829", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594514", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594514", abstract = "Graph edit distance (GED) computation is a fundamental NP-hard problem in graph theory. Given a graph pair $ (G_1, G_2) $, GED is defined as the minimum number of primitive operations converting $ G_1 $ to $ G_2 $. 
Early studies focus on search-based inexact algorithms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schaler:2023:BUE, author = "Christine Sch{\"a}ler and Thomas H{\"u}tter and Martin Sch{\"a}ler", title = "Benchmarking the Utility of $w$-Event Differential Privacy Mechanisms --- When Baselines Become Mighty Competitors", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1830--1842", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594515", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594515", abstract = "The $w$-event framework is the current standard for ensuring differential privacy on continuously monitored data streams. Following the proposition of $w$-event differential privacy, various mechanisms to implement the framework are proposed. Their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Augustine:2023:CGA, author = "Eriq Augustine and Lise Getoor", title = "Collective Grounding: Applying Database Techniques to Grounding Templated Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1843--1855", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594516", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594516", abstract = "The process of instantiating, or ``grounding'', a first-order model is a fundamental component of reasoning in logic. 
It has been widely studied in the context of theorem proving, database theory, and artificial intelligence. Within the relational \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Adams:2023:EEP, author = "Jan Niklas Adams and Cameron Pitsch and Tobias Brockhoff and Wil M. P. van der Aalst", title = "An Experimental Evaluation of Process Concept Drift Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1856--1869", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594517", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594517", abstract = "Process mining provides techniques to learn models from event data. These models can be descriptive (e.g., Petri nets) or predictive (e.g., neural networks). The learned models offer operational support to process owners by conformance checking, process \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vitagliano:2023:PDL, author = "Gerardo Vitagliano and Mazhar Hameed and Lan Jiang and Lucas Reisener and Eugene Wu and Felix Naumann", title = "{Pollock}: a Data Loading Benchmark", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1870--1882", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594518", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594518", abstract = "Any system at play in a data-driven project has a fundamental requirement: the ability to load data. 
The de-facto standard format to distribute and consume raw data is csv. Yet, the plain text and flexible nature of this format make such files often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2023:APL, author = "Yingtai Xiao and Guanhong Wang and Danfeng Zhang and Daniel Kifer", title = "Answering Private Linear Queries Adaptively Using the Common Mechanism", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1883--1896", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594519", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594519", abstract = "When analyzing confidential data through a privacy filter, a data scientist often needs to decide which queries will best support their intended analysis. For example, an analyst may wish to study noisy two-way marginals in a dataset produced by a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Du:2023:LLD, author = "Yuntao Du and Yujia Hu and Zhikun Zhang and Ziquan Fang and Lu Chen and Baihua Zheng and Yunjun Gao", title = "{LDPTrace}: Locally Differentially Private Trajectory Synthesis", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1897--1909", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594520", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594520", abstract = "Trajectory data has the potential to greatly benefit a wide-range of real-world applications, such as tracking the spread of the disease through people's movement patterns and providing personalized location-based services based on travel preference. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kitsios:2023:SPH, author = "Xenophon Kitsios and Panagiotis Liakos and Katia Papakonstantinopoulou and Yannis Kotidis", title = "{Sim-Piece}: Highly Accurate Piecewise Linear Approximation through Similar Segment Merging", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1910--1922", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594521", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594521", abstract = "Approximating series of timestamped data points using a sequence of line segments with a maximum error guarantee is a fundamental data compression problem, termed as piecewise linear approximation (PLA). 
Due to the increasing need to analyze massive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Marinelli:2023:TMF, author = "Eugenio Marinelli and Yiqing Yan and Virginie Magnone and Charlotte Dumargne and Pascal Barbry and Thomas Heinis and Raja Appuswamy", title = "Towards Migration-Free {``Just-in-Case''} Data Archival for Future Cloud Data Lakes Using Synthetic {DNA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1923--1929", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594522", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594522", abstract = "Given the growing adoption of AI, cloud data lakes are facing the need to support cost-effective ``just-in-case'' data archival over long time periods to meet regulatory compliance requirements. Unfortunately, current media technologies suffer from \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dong:2023:FGR, author = "Zhiyuan Dong and Zhaoguo Wang and Xiaodong Zhang and Xian Xu and Changgeng Zhao and Haibo Chen and Aurojit Panda and Jinyang Li", title = "Fine-Grained Re-Execution for Efficient Batched Commit of Distributed Transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1930--1943", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594523", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594523", abstract = "Distributed transaction systems incur extensive cross-node communication to execute and commit serializable OLTP transactions. As a result, their performance greatly suffers. Caching data at nodes that execute transactions can cut down remote reads. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2023:LDT, author = "Wenfei Fan and Resul Tugay and Yaoshu Wang and Min Xie and Muhammad Asif Ali", title = "Learning and Deducing Temporal Orders", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1944--1957", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594524", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594524", abstract = "This paper studies how to determine temporal orders on attribute values in a set of tuples that pertain to the same entity, in the absence of complete timestamps. 
We propose a creator-critic framework to learn and deduce temporal orders by combining \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:BBG, author = "Xu Chen and Zhen Wang and Shuncheng Liu and Yaliang Li and Kai Zeng and Bolin Ding and Jingren Zhou and Han Su and Kai Zheng", title = "{BASE}: Bridging the Gap between Cost and Latency for Query Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1958--1966", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594525", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594525", abstract = "Some recent works have shown the advantages of reinforcement learning (RL) based learned query optimizers. These works often use the cost (i.e., the estimation of cost model) or the latency (i.e., execution time) as guidance signals for training their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lemiesz:2023:EFO, author = "Jakub Lemiesz", title = "Efficient Framework for Operating on Data Sketches", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1967--1978", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594526", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594526", abstract = "We study the problem of analyzing massive data streams based on concise data sketches. 
Recently, a number of papers have investigated how to estimate the results of set-theory operations based on sketches. In this paper we present a framework that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:TEI, author = "Xi Zhao and Yao Tian and Kai Huang and Bolong Zheng and Xiaofang Zhou", title = "Towards Efficient Index Construction and Approximate Nearest Neighbor Search in High-Dimensional Spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1979--1991", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594527", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594527", abstract = "The approximate nearest neighbor (ANN) search in high-dimensional spaces is a fundamental but computationally very expensive problem. Many methods have been designed for solving the ANN problem, such as LSH-based methods and graph-based methods. The LSH- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2023:LIC, author = "Zhaoyan Sun and Xuanhe Zhou and Guoliang Li", title = "Learned Index: a Comprehensive Experimental Evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "1992--2004", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594528", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594528", abstract = "Indexes can improve query-processing performance by avoiding full table scans. 
Although traditional indexes (e.g., B+-tree) have been widely used, learned indexes are proposed to adopt machine learning models to reduce the query latency and index size. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:LIG, author = "Yanping Zhang and Johes Bater and Kartik Nayak and Ashwin Machanavajjhala", title = "{Longshot}: Indexing Growing Databases Using {MPC} and Differential Privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "2005--2018", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594529", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594529", abstract = "In this work, we propose Longshot, a novel design for secure outsourced database systems that supports ad-hoc queries through the use of secure multi-party computation and differential privacy. By combining these two techniques, we build and maintain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paparrizos:2023:ASS, author = "John Paparrizos and Kaize Wu and Aaron Elmore and Christos Faloutsos and Michael J. 
Franklin", title = "Accelerating Similarity Search for Elastic Measures: a Study and New Generalization of Lower Bounding Distances", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "2019--2032", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594530", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594530", abstract = "Similarity search is a core analytical task, and its performance critically depends on the choice of distance measure. For time-series querying, elastic measures achieve state-of-the-art accuracy but are computationally expensive. Thus, fast lower \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2023:ALA, author = "Chenyuan Wu and Bhavana Mehta and Mohammad Javad Amiri and Ryan Marcus and Boon Thau Loo", title = "{AdaChain}: a Learned Adaptive Blockchain", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "2033--2046", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594531", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594531", abstract = "This paper presents AdaChain, a learning-based blockchain framework that adaptively chooses the best permissioned blockchain architecture to optimize effective throughput for dynamic transaction workloads. AdaChain addresses the challenge in Blockchain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2023:ICS, author = "Yingli Zhou and Yixiang Fang and Wensheng Luo and Yunming Ye", title = "Influential Community Search over Large Heterogeneous Information Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "8", pages = "2047--2060", month = apr, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3594512.3594532", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Jun 23 11:11:42 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3594512.3594532", abstract = "Recently, the topic of influential community search has gained much attention. Given a graph, it aims to find communities of vertices with high importance values from it. Existing works mainly focus on conventional homogeneous networks, where vertices \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arafat:2023:NBH, author = "Naheed Anjum Arafat and Arijit Khan and Arpit Kumar Rai and Bishwamittra Ghosh", title = "Neighborhood-Based Hypergraph Core Decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2061--2074", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598582", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598582", abstract = "We propose neighborhood-based core decomposition: a novel way of decomposing hypergraphs into hierarchical neighborhood-cohesive subhypergraphs. Alternative approaches to decomposing hypergraphs, e.g., reduction to clique or bipartite graphs, are not \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Layne:2023:TSG, author = "Janet Layne and Justin Carpenter and Edoardo Serra and Francesco Gullo", title = "Temporal {SIR-GN}: Efficient and Effective Structural Representation Learning for Temporal Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2075--2089", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598583", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598583", abstract = "Node representation learning (NRL) generates numerical vectors (embeddings) for the nodes of a graph. Structural NRL specifically assigns similar node embeddings for those nodes that exhibit similar structural roles. This is in contrast with its \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Haas:2023:WMN, author = "Gabriel Haas and Viktor Leis", title = "What Modern {NVMe} Storage Can Do, and How to Exploit it: High-Performance {I/O} for High-Performance Storage Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2090--2102", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598584", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598584", abstract = "NVMe SSDs based on flash are cheap and offer high throughput. Combining several of these devices into a single server enables 10 million I/O operations per second or more. 
Our experiments show that existing out-of-memory database systems and storage engines achieve only a fraction of this performance. In this work, we demonstrate that it is possible to close the performance gap between hardware and software through an I/O optimized storage engine design. In a heavy out-of-memory setting, where the dataset is 10 times larger than main memory, our system can achieve more than 1 million TPC-C transactions per second.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Banakar:2023:WES, author = "Vinay Banakar and Kan Wu and Yuvraj Patel and Kimberly Keeton and Andrea C. Arpaci-Dusseau and Remzi H. Arpaci-Dusseau", title = "{WiscSort}: External Sorting for Byte-Addressable Storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2103--2116", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598585", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598585", abstract = "We present WiscSort, a new approach to high-performance concurrent sorting for existing and future byte-addressable storage (BAS) devices. WiscSort carefully reduces writes, exploits random reads by splitting keys and values during sorting, and performs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ayad:2023:TIL, author = "Lorraine A. K. Ayad and Grigorios Loukides and Solon P. 
Pissis", title = "Text Indexing for Long Patterns: Anchors are All you Need", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2117--2131", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598586", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598586", abstract = "In many real-world database systems, a large fraction of the data is represented by strings: sequences of letters over some alphabet. This is because strings can easily encode data arising from different sources. It is often crucial to represent such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Afroozeh:2023:FCL, author = "Azim Afroozeh and Peter Boncz", title = "The {FastLanes} Compression Layout: Decoding $ > 100 $ Billion Integers per Second with Scalar Code", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2132--2144", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598587", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598587", abstract = "The open-source FastLanes project aims to improve big data formats, such as Parquet, ORC and columnar database formats, in multiple ways. In this paper, we significantly accelerate decoding of all common Light-Weight Compression (LWC) schemes: DICT, FOR, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yue:2023:VAP, author = "Cong Yue and Meihui Zhang and Changhao Zhu and Gang Chen and Dumitrel Loghin and Beng Chin Ooi", title = "{VeriBench}: Analyzing the Performance of Database Systems with Verifiability", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2145--2157", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598588", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598588", abstract = "Database systems are paying more attention to data security in recent years. Immutable systems such as blockchains, verifiable databases, and ledger databases are equipped with various verifiability mechanisms to protect data. Such systems often adopt \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:TDL, author = "Jiangneng Li and Zheng Wang and Gao Cong and Cheng Long and Han Mao Kiah and Bin Cui", title = "Towards Designing and Learning Piecewise Space-Filling Curves", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2158--2171", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598589", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598589", abstract = "To index multi-dimensional data, space-filling curves (SFCs) have been used to map the data to one dimension, and then a one-dimensional indexing method such as the B-tree is used to index the mapped data. 
The existing SFCs all adopt a single mapping \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2023:MQB, author = "Xiaoke Zhu and Yang Liu and Shuhao Liu and Wenfei Fan", title = "{MiniGraph}: Querying Big Graphs with a Single Machine", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2172--2185", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598590", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598590", abstract = "This paper presents MiniGraph, an out-of-core system for querying big graphs with a single machine. As opposed to previous single-machine graph systems, MiniGraph proposes a pipelined architecture to overlap I/O and CPU operations, and improves multi- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Choi:2023:BEC, author = "Yunyoung Choi and Kunsoo Park and Hyunjoon Kim", title = "{BICE}: Exploring Compact Search Space by Using Bipartite Matching and Cell-Wide Verification", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2186--2198", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598591", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598591", abstract = "Subgraph matching is the problem of searching for all embeddings of a query graph in a data graph, and subgraph query processing (also known as subgraph search) is to find all the data graphs that contain a query graph as subgraphs. 
Extensive research \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tian:2023:MDT, author = "Anxin Tian and Alexander Zhou and Yue Wang and Lei Chen", title = "Maximal {D}-Truss Search in Dynamic Directed Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2199--2211", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598592", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598592", abstract = "Community search (CS) aims at personalized subgraph discovery which is the key to understanding the organisation of many real-world networks. CS in undirected networks has attracted significant attention from researchers, including many solutions for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:DDD, author = "Pengfei Li and Hua Lu and Rong Zhu and Bolin Ding and Long Yang and Gang Pan", title = "{DILI}: a Distribution-Driven Learned Index", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2212--2224", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598593", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598593", abstract = "Targeting in-memory one-dimensional search keys, we propose a novel DIstribution-driven Learned Index tree (DILI), where a concise and computation-efficient linear regression model is used for each node. 
An internal node's key range is equally divided \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeakis:2023:PTE, author = "Alexandros Zeakis and George Papadakis and Dimitrios Skoutas and Manolis Koubarakis", title = "Pre-Trained Embeddings for Entity Resolution: an Experimental Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2225--2238", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598594", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598594", abstract = "Many recent works on Entity Resolution (ER) leverage Deep Learning techniques involving language models to improve effectiveness. This is applied to both main steps of ER, i.e., blocking and matching. Several pre-trained embeddings have been tested, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2023:DGN, author = "Yanping Zheng and Zhewei Wei and Jiajun Liu", title = "Decoupled Graph Neural Networks for Large Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2239--2247", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598595", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598595", abstract = "Real-world graphs, such as social networks, financial transactions, and recommendation systems, often demonstrate dynamic behavior. 
This phenomenon, known as graph stream, involves the dynamic changes of nodes and the emergence and disappearance of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zardbani:2023:AIO, author = "Fatemeh Zardbani and Nikos Mamoulis and Stratos Idreos and Panagiotis Karras", title = "Adaptive Indexing of Objects with Spatial Extent", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2248--2260", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598596", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598596", abstract = "Can we quickly explore large multidimensional data in main memory? Adaptive indexing responds to this need by building an index incrementally, in response to queries; in its default form, it indexes a single attribute or, in the presence of several \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:LNF, author = "Xu Chen and Haitian Chen and Zibo Liang and Shuncheng Liu and Jinghong Wang and Kai Zeng and Han Su and Kai Zheng", title = "{LEON}: a New Framework for {ML}-Aided Query Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2261--2273", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598597", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598597", abstract = "Query optimization has long been a fundamental yet challenging topic in the database field. 
With the prosperity of machine learning (ML), some recent works have shown the advantages of reinforcement learning (RL) based learned query optimizer. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Faria:2023:TIT, author = "Nuno Faria and Jos{\'e} Pereira and Ana Nunes Alonso and Ricardo Vila{\c{c}}a and Yunus Koning and Niels Nes", title = "{TiQuE}: Improving the Transactional Performance of Analytical Systems for True Hybrid Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2274--2288", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598598", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598598", abstract = "Transactions have been a key issue in database management for a long time and there are a plethora of architectures and algorithms to support and implement them. The current state-of-the-art is focused on storage management and is tightly coupled with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bang:2023:SRQ, author = "Jaeho Bang and Gaurav Tarlok Kakkar and Pramod Chunduri and Subrata Mitra and Joy Arulraj", title = "{Seiden}: Revisiting Query Processing in Video Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2289--2301", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598599", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598599", abstract = "State-of-the-art video database management systems (VDBMSs) often use lightweight proxy models to accelerate object retrieval and aggregate queries. The key assumption underlying these systems is that the proxy model is an order of magnitude faster than \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kossmann:2023:ETL, author = "Ferdi Kossmann and Ziniu Wu and Eugenie Lai and Nesime Tatbul and Lei Cao and Tim Kraska and Sam Madden", title = "Extract-Transform-Load for Video Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2302--2315", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598600", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598600", abstract = "Social media, self-driving cars, and traffic cameras produce video streams at large scales and cheap cost. However, storing and querying video at such scales is prohibitively expensive. 
We propose to treat large-scale video analytics as a data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sudhir:2023:PED, author = "Sivaprasad Sudhir and Wenbo Tao and Nikolay Laptev and Cyrille Habis and Michael Cafarella and Samuel Madden", title = "{Pando}: Enhanced Data Skipping with Logical Data Partitioning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2316--2329", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598601", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598601", abstract = "With enormous volumes of data, quickly retrieving data that is relevant to a query is essential for achieving high performance. Modern cloud-based database systems often partition the data into blocks and employ various techniques to skip irrelevant \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maliszewski:2023:CLJ, author = "Kajetan Maliszewski and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker Markl", title = "Cracking-Like Join for Trusted Execution Environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2330--2343", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598602", abstract = "Data processing on non-trusted infrastructures, such as the public cloud, has become increasingly popular, despite posing risks to data privacy. 
However, the existing cloud DBMSs either lack sufficient privacy guarantees or underperform. In this paper, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Calikyilmaz:2023:OQA, author = "Umut {\c{C}}alikyilmaz and Sven Groppe and Jinghua Groppe and Tobias Winker and Stefan Prestel and Farida Shagieva and Daanish Arya and Florian Preis and Le Gruenwald", title = "Opportunities for Quantum Acceleration of Databases: Optimization of Queries and Transaction Schedules", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2344--2353", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598603", abstract = "The capabilities of quantum computers, such as the number of supported qubits and maximum circuit depth, have grown exponentially in recent years. Commercially relevant applications that take advantage of quantum computing are expected to be available \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2023:SSD, author = "Xupeng Miao and Yining Shi and Zhi Yang and Bin Cui and Zhihao Jia", title = "{SDPipe}: a Semi-Decentralized Framework for Heterogeneity-Aware Pipeline-parallel Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2354--2363", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598604", abstract = "The increasing size of both deep learning models and training data necessitates the ability to scale out model training through pipeline-parallel training, which combines pipelined model parallelism and data parallelism. However, most of them assume an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2023:LCP, author = "Bohyun Lee and Mijin An and Sang-Won Lee", title = "{LRU-C}: Parallelizing Database {I/Os} for Flash {SSDs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2364--2376", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598605", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598605", abstract = "The conventional database buffer managers have two inherent sources of I/O serialization: read stall and mutex conflict. The serialized I/O makes storage and CPU under-utilized, limiting transaction throughput and latency. Such harm stands out on flash \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:WYF, author = "Zixuan Chen and Panagiotis Manolios and Mirek Riedewald", title = "Why Not Yet: Fixing a Top-$k$ Ranking that is Not Fair to Individuals", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "9", pages = "2377--2390", month = may, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3598581.3598606", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:00 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3598581.3598606", abstract = "This work considers why-not questions in the context of top-k queries and score-based ranking functions. Following the popular linear scalarization approach for multi-objective optimization, we study rankings based on the weighted sum of multiple \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sharma:2023:ITS, author = "Shantanu Sharma and Yin Li and Sharad Mehrotra and Nisha Panwar and Komal Kumari and Swagnik Roychoudhury", title = "Information-Theoretically Secure and Highly Efficient Search and Row Retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2391--2403", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603582", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603582", abstract = "Information-theoretic or unconditional security provides the highest level of security --- independent of the computational capability of an adversary. 
Secret-sharing techniques achieve information-theoretic security by splitting a secret into multiple \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kato:2023:OOF, author = "Fumiyuki Kato and Yang Cao and Masatoshi Yoshikawa", title = "{Olive}: Oblivious Federated Learning on Trusted Execution Environment against the Risk of Sparsification", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2404--2417", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603583", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603583", abstract = "Combining Federated Learning (FL) with a Trusted Execution Environment (TEE) is a promising approach for realizing privacy-preserving FL, which has garnered significant academic attention in recent years. Implementing the TEE on the server side enables \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2023:TEF, author = "Chengyang Luo and Qing Liu and Yunjun Gao and Lu Chen and Ziheng Wei and Congcong Ge", title = "{Task}: an Efficient Framework for Instant Error-Tolerant Spatial Keyword Queries on Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2418--2430", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603584", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603584", abstract = "Instant spatial keyword queries return the results as soon as users type in some characters instead of a complete keyword, which allow users to query the geo-textual data in a type-as-you-search manner. However, the existing methods of instant spatial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kassaie:2023:ACI, author = "Besat Kassaie and Frank Wm. Tompa", title = "Autonomously Computable Information Extraction", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2431--2443", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603585", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603585", abstract = "Most optimization techniques deployed in information extraction systems assume that source documents are static. Instead, extracted relations can be considered to be materialized views defined by a language built on regular expressions. Using this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koutsoukos:2023:NIV, author = "Dimitrios Koutsoukos and Raghav Bhartia and Michal Friedman and Ana Klimovic and Gustavo Alonso", title = "{NVM}: Is it Not Very Meaningful for Databases?", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2444--2457", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603586", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603586", abstract = "Persistent or Non Volatile Memory (PMEM) offers expanded memory capacity and faster access to persistent storage. However, there is no comprehensive empirical analysis of existing database engines under different PMEM modes, to understand how databases \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dong:2023:DJT, author = "Yuyang Dong and Chuan Xiao and Takuma Nozawa and Masafumi Enomoto and Masafumi Oyamada", title = "{DeepJoin}: Joinable Table Discovery with Pre-Trained Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2458--2470", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603587", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603587", abstract = "Due to the usefulness in data enrichment for data analysis tasks, joinable table discovery has become an important operation in data lake management. 
Existing approaches target equi-joins, the most common way of combining tables for creating a unified \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2023:FPP, author = "Yuncheng Wu and Naili Xing and Gang Chen and Tien Tuan Anh Dinh and Zhaojing Luo and Beng Chin Ooi and Xiaokui Xiao and Meihui Zhang", title = "{Falcon}: a Privacy-Preserving and Interpretable Vertical Federated Learning System", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2471--2484", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603588", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603588", abstract = "Federated learning (FL) enables multiple data owners to collaboratively train machine learning (ML) models without disclosing their raw data. In the vertical federated learning (VFL) setting, the collaborating parties have data from the same set of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2023:ESE, author = "Haotian Gao and Cong Yue and Tien Tuan Anh Dinh and Zhiyong Huang and Beng Chin Ooi", title = "Enabling Secure and Efficient Data Analytics Pipeline Evolution with Trusted Execution Environment", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2485--2498", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603589", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603589", abstract = "Modern data analytics pipelines are highly dynamic, as they are constantly monitored and fine-tuned by both data engineers and scientists. Recent systems managing pipelines ease creating, deploying, and tracking their evolution. However, privacy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Doraiswamy:2023:CGD, author = "Harish Doraiswamy and Vikas Kalagi and Karthik Ramachandra and Jayant R. Haritsa", title = "A Case for Graphics-Driven Query Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2499--2511", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603590", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603590", abstract = "Over the past decade, the database research community has directed considerable attention towards harnessing the power of GPUs in query processing engines. 
The proposed techniques have primarily focused on devising customized low-level mechanisms that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tian:2023:EER, author = "Wei Tian and Jieming Shi and Siqiang Luo and Hui Li and Xike Xie and Yuanhang Zou", title = "Effective and Efficient Route Planning Using Historical Trajectories on Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2512--2524", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603591", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603591", abstract = "We study route planning that utilizes historical trajectories to predict a realistic route from a source to a destination on a road network at given departure time. Route planning is a fundamental task in many location-based services. It is challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lampropoulos:2023:AIH, author = "Konstantinos Lampropoulos and Fatemeh Zardbani and Nikos Mamoulis and Panagiotis Karras", title = "Adaptive Indexing in High-Dimensional Metric Spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2525--2537", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603592", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603592", abstract = "Similarity search in high-dimensional metric spaces is routinely used in many applications including content-based image retrieval, bioinformatics, data mining, and recommender systems. Search can be accelerated by the use of an index. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2023:PCS, author = "Sen Gao and Hongchao Qin and Rong-Hua Li and Bingsheng He", title = "Parallel Colorful {$h$}-Star Core Maintenance in Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2538--2550", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603593", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603593", abstract = "The higher-order structure cohesive subgraph mining is an important operator in many graph analysis tasks. Recently, the colorful $h$-star core model has been proposed as an effective alternative to $h$-clique based cohesive subgraph models, in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:MFM, author = "Jia Li and Wenyue Zhao and Nikos Ntarmos and Yang Cao and Peter Buneman", title = "{MITra}: a Framework for Multi-Instance Graph Traversal", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2551--2564", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603594", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603594", abstract = "This paper presents MITra, a framework for composing multi-instance graph algorithms that traverse from multiple source vertices simultaneously over a single thread. Underlying MITra is a model of multi-instance traversal that uniformly captures \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:CEB, author = "Jiazun Chen and Yikuan Xia and Jun Gao", title = "{CommunityAF}: an Example-Based Community Search Method via Autoregressive Flow", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2565--2577", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603595", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603595", abstract = "Example-based community search utilizes hidden patterns of given examples rather than explicit rules, reducing users' burden and enhancing flexibility. However, existing works face challenges such as low scalability, high training cost, and improper \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2023:ABA, author = "Yiming Lin and Yeye He and Surajit Chaudhuri", title = "{Auto-BI}: Automatically Build {BI}-Models Leveraging Local Join Prediction and Global Schema Graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2578--2590", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603596", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603596", abstract = "Business Intelligence (BI) is crucial in modern enterprises and billion-dollar business. Traditionally, technical experts like database administrators would manually prepare BI-models (e.g., in star or snowflake schemas) that join tables in data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:TDC, author = "Yuemin Zhang and Qingqing Ye and Rui Chen and Haibo Hu and Qilong Han", title = "Trajectory Data Collection with Local Differential Privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2591--2604", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603597", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603597", abstract = "Trajectory data collection is a common task with many applications in our daily lives. Analyzing trajectory data enables service providers to enhance their services, which ultimately benefits users. However, directly collecting trajectory data may give \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2023:LNM, author = "Jian Gao and Xin Cao and Xin Yao and Gong Zhang and Wei Wang", title = "{LMSFC}: a Novel Multidimensional Index Based on Learned Monotonic Space Filling Curves", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2605--2617", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603598", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603598", abstract = "The recently proposed learned indexes have attracted much attention as they can adapt to the actual data and query distributions to attain better search efficiency. Based on this technique, several existing works build up indexes for multi-dimensional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rong:2023:SDC, author = "Kexin Rong and Mihai Budiu and Athinagoras Skiadopoulos and Lalith Suresh and Amy Tai", title = "Scaling a Declarative Cluster Manager Architecture with Query Optimization Techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2618--2631", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603599", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603599", abstract = "Cluster managers play a crucial role in data centers by distributing workloads among infrastructure resources. 
Declarative Cluster Management (DCM) is a new cluster management architecture that enables users to express placement policies declaratively \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Singh:2023:CLT, author = "Mukul Singh and Jos{\'e} Cambronero S{\'a}nchez and Sumit Gulwani and Vu Le and Carina Negreanu and Mohammad Raza and Gust Verbruggen", title = "{Cornet}: Learning Table Formatting Rules By Example", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2632--2644", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603600", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603600", abstract = "Spreadsheets are widely used for table manipulation and presentation. Stylistic formatting of these tables is an important property for presentation and analysis. As a result, popular spreadsheet software, such as Excel, supports automatically formatting tables based on rules. Unfortunately, writing such formatting rules can be challenging for users as it requires knowledge of the underlying rule language and data logic. We present Cornet, a system that tackles the novel problem of automatically learning such formatting rules from user-provided formatted cells. Cornet takes inspiration from advances in inductive programming and combines symbolic rule enumeration with a neural ranker to learn conditional formatting rules. To motivate and evaluate our approach, we extracted tables with over 450K unique formatting rules from a corpus of over 1.8M real worksheets. 
Since we are the first to introduce the task of automatically learning conditional formatting rules, we compare Cornet to a wide range of symbolic and neural baselines adapted from related domains. Our results show that Cornet accurately learns rules across varying setups. Additionally, we show that in some cases Cornet can find rules that are shorter than those written by users and can also discover rules in spreadsheets that users have manually formatted. Furthermore, we present two case studies investigating the generality of our approach by extending Cornet to related data tasks (e.g., filtering) and generalizing to conditional formatting over multiple columns.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zuo:2023:AAR, author = "Chaoji Zuo and Dong Deng", title = "{ARKGraph}: All-Range Approximate {$K$}-Nearest-Neighbor Graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2645--2658", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603601", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603601", abstract = "Given a collection of vectors, the approximate K-nearest-neighbor graph (KGraph for short) connects every vector to its approximate K-nearest-neighbors (KNN for short). KGraph plays an important role in high dimensional data visualization, semantic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Youngmann:2023:CDI, author = "Brit Youngmann and Michael Cafarella and Babak Salimi and Anna Zeng", title = "Causal Data Integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2659--2665", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603602", abstract = "Causal inference is fundamental to empirical scientific discoveries in natural and social sciences; however, in the process of conducting causal inference, data management problems can lead to false discoveries. Two such problems are (i) not having all \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Martini:2023:MFI, author = "Michael Martini and Daniel Schuster and Wil M. P. van der Aalst", title = "Mining Frequent Infix Patterns from Concurrency-Aware Process Execution Variants", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2666--2678", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603603", abstract = "Event logs, as considered in process mining, document a large number of individual process executions. Moreover, each process execution consists of various executed activities. To cope with the vast amount of process executions in event logs, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pedreira:2023:CDM, author = "Pedro Pedreira and Orri Erling and Konstantinos Karanasos and Scott Schneider and Wes McKinney and Satya R. Valluri and Mohamed Zait and Jacques Nadeau", title = "The Composable Data Management System Manifesto", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "10", pages = "2679--2685", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3603581.3603604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 9 10:33:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3603581.3603604", abstract = "The requirement for specialization in data management systems has evolved faster than our software development practices. After decades of organic growth, this situation has created a siloed landscape composed of hundreds of products developed and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schmitt:2023:TLS, author = "Daniel Schmitt and Daniel Kocher and Nikolaus Augsten and Willi Mann and Alexander Miller", title = "A Two-Level Signature Scheme for Stable Set Similarity Joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2686--2698", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611480", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611480", abstract = "We study the set similarity join problem, which retrieves all pairs of similar sets from two collections of sets for a given distance function.
Existing exact solutions employ a signature-based filter-verification framework: If two sets are similar, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rodriguez:2023:SRD, author = "Olivier Rodriguez and Federico Ulliana and Marie-Laure Mugnier", title = "Scalable Reasoning on Document Stores via Instance-Aware Query Rewriting", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2699--2713", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611481", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611481", abstract = "Data trees, typically encoded in JSON, are ubiquitous in data-driven applications. This ubiquity makes urgent the development of novel techniques for querying heterogeneous JSON data in a flexible manner. We propose a rule language for JSON, called \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:EVS, author = "Enhao Zhang and Maureen Daum and Dong He and Brandon Haynes and Ranjay Krishna and Magdalena Balazinska", title = "{EQUI-VOCAL}: Synthesizing Queries for Compositional Video Events from Limited User Interactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2714--2727", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611482", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611482", abstract = "We introduce EQUI-VOCAL: a new system that automatically synthesizes queries over videos from limited user interactions.
The user only provides a handful of positive and negative examples of what they are looking for. EQUI-VOCAL utilizes these initial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:LBG, author = "Yuhao Zhang and Arun Kumar", title = "{Lotan}: Bridging the Gap between {GNNs} and Scalable Graph Analytics Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2728--2741", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611483", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611483", abstract = "Recent advances in Graph Neural Networks (GNNs) have changed the landscape of modern graph analytics. The complexity of GNN training and the scalability challenges have also sparked interest from the systems community, with efforts to build systems that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kraft:2023:EAT, author = "Peter Kraft and Qian Li and Xinjing Zhou and Peter Bailis and Michael Stonebraker and Matei Zaharia and Xiangyao Yu", title = "{Epoxy}: {ACID} Transactions across Diverse Data Stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2742--2754", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611484", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611484", abstract = "Developers are increasingly building applications that incorporate multiple data stores, for example to manage heterogeneous data.
Often, these require transactional safety for operations across stores, but few systems support such guarantees. To solve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bother:2023:AVH, author = "Maximilian B{\"o}ther and Lawrence Benson and Ana Klimovic and Tilmann Rabl", title = "Analyzing Vectorized Hash Tables across {CPU} Architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2755--2768", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611485", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611485", abstract = "Data processing systems often leverage vector instructions to achieve higher performance. When applying vector instructions, an often overlooked data structure is the hash table, even though it is fundamental in data processing systems for operations such as indexing, aggregating, and joining. In this paper, we characterize and evaluate three fundamental vectorized hashing schemes, vectorized linear probing (VLP), vectorized fingerprinting (VFP), and bucket-based comparison (BBC). We implement these hashing schemes on the x86, ARM, and Power CPU architectures, as modern database systems must provide efficient implementations for multiple platforms due to the continuously increasing hardware heterogeneity. We present various implementation variants and platform-specific optimizations, which we evaluate for integer keys, string keys, large payloads, skewed distributions, and multiple threads. Our extensive evaluation and comparison to three scalar hashing schemes on four servers shows that BBC outperforms scalar linear probing by a factor of more than 2x, while also scaling well to high load factors.
We find that vectorized hashing schemes come with caveats that need to be considered, such as the increased engineering overhead, differences between CPUs, and differences between vector ISAs, such as AVX and AVX-512, which impact performance. We conclude with key findings for vectorized hashing scheme implementations.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Durner:2023:ECO, author = "Dominik Durner and Viktor Leis and Thomas Neumann", title = "Exploiting Cloud Object Storage for High-Performance Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2769--2782", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611486", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611486", abstract = "Elasticity of compute and storage is crucial for analytical cloud database systems. All cloud vendors provide disaggregated object stores, which can be used as storage backend for analytical query engines. Until recently, local storage was unavoidable \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karapiperis:2023:RBS, author = "Dimitrios Karapiperis and Christos Tjortjis and Vassilios S.
Verykios", title = "A Randomized Blocking Structure for Streaming Record Linkage", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2783--2791", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611487", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611487", abstract = "A huge amount of data, in terms of streams, are collected nowadays via a variety of sources, such as sensors, mobile devices, or even raw log files. The unprecedented rate at which these data are generated and collected calls for novel record linkage \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Riveros:2023:RNR, author = "Cristian Riveros and Nicol{\'a}s {Van Sint Jan} and Domagoj Vrgo{\v{c}}", title = "{REmatch}: a Novel Regex Engine for Finding All Matches", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2792--2804", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611488", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611488", abstract = "In this paper, we present the REmatch system for information extraction. REmatch is based on a recently proposed enumeration algorithm for evaluating regular expressions with capture variables supporting the all-match semantics. It tells a story of what it takes to make a theoretically optimal algorithm work in practice. As we show here, a naive implementation of the original algorithm would have a hard time dealing with realistic workloads.
We thus develop a new algorithm and a series of optimizations that make REmatch as fast or faster than many popular RegEx engines while at the same time being able to return all the outputs: a task that most other engines tend to struggle with.", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:AAO, author = "Junxiong Wang and Immanuel Trummer and Ahmet Kara and Dan Olteanu", title = "{ADOPT}: Adaptively Optimizing Attribute Orders for Worst-Case Optimal Join Algorithms via Reinforcement Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2805--2817", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611489", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611489", abstract = "The performance of worst-case optimal join algorithms depends on the order in which the join attributes are processed.
Selecting good orders before query execution is hard, due to the large space of possible orders and unreliable execution cost \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2023:TSM, author = "Zheng Hu and Weiguo Zheng and Xiang Lian", title = "Triangular Stability Maximization by Influence Spread over Social Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2818--2831", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611490", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611490", abstract = "In many real-world applications such as social network analysis and online advertising/marketing, one of the most important and popular problems is called influence maximization (IM), which finds a set of k seed users that maximize the expected number \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guan:2023:CSE, author = "Haoquan Guan and Ziling Chen and Shaoxu Song", title = "{CORE-Sketch}: On Exact Computation of Median Absolute Deviation with Limited Space", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2832--2844", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611491", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611491", abstract = "Median absolute deviation (MAD), the median of the absolute deviations from the median, has been found useful in various applications such as outlier detection.
Together with median, MAD is more robust to abnormal data than mean and standard deviation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lulf:2023:FSC, author = "Christian L{\"u}lf and Denis Mayr Lima Martins and Marcos Antonio Vaz Salles and Yongluan Zhou and Fabian Gieseke", title = "Fast Search-by-Classification for Large-Scale Databases Using Index-Aware Decision Trees and Random Forests", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2845--2857", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611492", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611492", abstract = "The vast amounts of data collected in various domains pose great challenges to modern data exploration and analysis. To find ``interesting'' objects in large databases, users typically define a query using positive and negative example objects and train a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Calautti:2023:SOC, author = "Marco Calautti and Mostafa Milani and Andreas Pieris", title = "Semi-Oblivious Chase Termination for Linear Existential Rules: an Experimental Study", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2858--2870", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611493", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611493", abstract = "The chase procedure is a fundamental algorithmic tool in databases that allows us to reason with constraints, such as existential rules, with a
plethora of applications. It takes as input a database and a set of constraints, and iteratively completes \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2023:AIC, author = "Kukjin Lee and Anshuman Dutt and Vivek Narasayya and Surajit Chaudhuri", title = "Analyzing the Impact of Cardinality Estimation on Execution Plans in {Microsoft SQL Server}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2871--2883", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611494", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611494", abstract = "Cardinality estimation is widely believed to be one of the most important causes of poor query plans. Prior studies evaluate the impact of cardinality estimation on plan quality on a set of Select-Project-Join queries on PostgreSQL DBMS. Our empirical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2023:WLZ, author = "Jongsung Lee and Donguk Kim and Jae W. Lee", title = "{WALTZ}: Leveraging Zone Append to Tighten the Tail Latency of {LSM} Tree on {ZNS SSD}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2884--2896", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611495", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611495", abstract = "We propose WALTZ, an LSM tree-based key-value store on the emerging Zoned Namespace (ZNS) SSD.
The key contribution of WALTZ is to leverage the zone append command, which is a recent addition to ZNS SSD specifications, to provide tight tail latency. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Russo:2023:AAQ, author = "Matthew Russo and Tatsunori Hashimoto and Daniel Kang and Yi Sun and Matei Zaharia", title = "Accelerating Aggregation Queries on Unstructured Streams of Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "11", pages = "2897--2910", month = jul, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611479.3611496", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Aug 25 07:25:43 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611479.3611496", abstract = "Analysts and scientists are interested in querying streams of video, audio, and text to extract quantitative insights. For example, an urban planner may wish to measure congestion by querying the live feed from a traffic camera.
Prior work has used deep \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Bai:2023:QIS,
  author = "Qiushi Bai and Sadeem Alsudais and Chen Li",
  title = "{QueryBooster}: Improving {SQL} Performance Using Middleware Services for Human-Centered Query Rewriting",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2911--2924",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611497",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611497",
  abstract = "SQL query performance is critical in database applications, and query rewriting is a technique that transforms an original query into an equivalent query with a better performance. In a wide range of database-supported systems, there is a unique problem \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhu:2023:CRA,
  author = "Jiongli Zhu and Sainyam Galhotra and Nazanin Sabri and Babak Salimi",
  title = "Consistent Range Approximation for Fair Predictive Modeling",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2925--2938",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611498",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611498",
  abstract = "This paper proposes a novel framework for certifying the fairness of predictive models trained on biased data.
It draws from query answering for incomplete and inconsistent databases to formulate the problem of consistent range approximation (CRA) of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Yin:2023:SMW,
  author = "Haoteng Yin and Muhan Zhang and Jianguo Wang and Pan Li",
  title = "{SUREL+}: Moving from Walks to Sets for Scalable Subgraph-Based Graph Representation Learning",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2939--2948",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611499",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611499",
  abstract = "Subgraph-based graph representation learning (SGRL) has recently emerged as a powerful tool in many prediction tasks on graphs due to its advantages in model expressiveness and generalization ability. Most previous SGRL models face computational issues \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:ESN,
  author = "Hanzhi Wang and Zhewei Wei",
  title = "Estimating Single-Node {PageRank} in {$ \tilde {O}(\min (d_t, \sqrt {m})) $} Time",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2949--2961",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611500",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611500",
  abstract = "PageRank is a famous measure of graph centrality that has numerous applications in practice.
The problem of computing a single node's PageRank has been the subject of extensive research over a decade. However, existing methods still incur large time complexities despite years of efforts. Even on undirected graphs where several valuable properties held by PageRank scores, the problem of locally approximating the PageRank score of a target node remains a challenging task. Two commonly adopted techniques, Monte-Carlo based random walks and backward push, both cost $ O(n) $ time in the worst-case scenario, which hinders existing methods from achieving a sublinear time complexity like $ O(\sqrt {m}) $ on an undirected graph with $n$ nodes and $m$ edges.\par In this paper, we focus on the problem of single-node PageRank computation on undirected graphs. We propose a novel algorithm, SetPush, for estimating single-node PageRank specifically on undirected graphs. With non-trivial analysis, we prove that our SetPush achieves the $ \tilde {O}(\min (d_t, \sqrt {m}))$ time complexity for estimating the target node $t$'s PageRank with constant relative error and constant failure probability on undirected graphs. We conduct comprehensive experiments to demonstrate the effectiveness of SetPush.",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:SAQ,
  author = "Yunjia Zhang and Yannis Chronis and Jignesh M. Patel and Theodoros Rekatsinas",
  title = "Simple Adaptive Query Processing vs.
Learned Query Optimizers: Observations and Analysis",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2962--2975",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611501",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611501",
  abstract = "There have been many decades of work on optimizing query processing in database management systems. Recently, modern machine learning (ML), and specifically reinforcement learning (RL), has gained increased attention as a means to develop a query \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Xu:2023:BTO,
  author = "Helen Xu and Amanda Li and Brian Wheatman and Manoj Marneni and Prashant Pandey",
  title = "{BP-Tree}: Overcoming the Point-Range Operation Tradeoff for In-Memory {B}-Trees",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2976--2989",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611502",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611502",
  abstract = "B-trees are the go-to data structure for in-memory indexes in databases and storage systems. B-trees support both point operations (i.e., inserts and finds) and range operations (i.e., iterators and maps).
However, there is an inherent tradeoff between \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Lv:2023:HXT,
  author = "Ge Lv and Chen Jason Zhang and Lei Chen",
  title = "{HENCE-X}: Toward Heterogeneity-Agnostic Multi-Level Explainability for Deep Graph Networks",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "2990--3003",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611503",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611503",
  abstract = "Deep graph networks (DGNs) have demonstrated their outstanding effectiveness on both heterogeneous and homogeneous graphs. However their black-box nature does not allow human users to understand their working mechanisms. Recently, extensive efforts have \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Yuan:2023:ARE,
  author = "Haitao Yuan and Sai Wang and Zhifeng Bao and Shangguang Wang",
  title = "Automatic Road Extraction with Multi-Source Data Revisited: Completeness, Smoothness and Discrimination",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3004--3017",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611504",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611504",
  abstract = "Extracting roads from multi-source data, such as aerial images and vehicle trajectories, is an important way to maintain road networks in the field of urban computing.
In this paper, we revisit the problem of road extraction and aim to boost its \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Fent:2023:ABQ,
  author = "Philipp Fent and Guido Moerkotte and Thomas Neumann",
  title = "Asymptotically Better Query Optimization Using Indexed Algebra",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3018--3030",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611505",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611505",
  abstract = "Query optimization is essential for the efficient execution of queries. The necessary analysis, if we can and should apply optimizations and transform the query plan, is already challenging. Traditional techniques focus on the availability of columns at \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Skavantzos:2023:NPG,
  author = "Philipp Skavantzos and Sebastian Link",
  title = "Normalizing Property Graphs",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3031--3043",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611506",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611506",
  abstract = "Normalization aims at minimizing sources of potential data inconsistency and costs of update maintenance incurred by data redundancy.
For relational databases, different classes of dependencies cause data redundancy and have resulted in proposals such \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2023:DDC,
  author = "Chunwei Liu and Anna Pavlenko and Matteo Interlandi and Brandon Haynes",
  title = "A Deep Dive into Common Open Formats for Analytical {DBMSs}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3044--3056",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611507",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611507",
  abstract = "This paper evaluates the suitability of Apache Arrow, Parquet, and ORC as formats for subsumption in an analytical DBMS. We systematically identify and explore the high-level features that are important to support efficient querying in modern OLAP DBMSs \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2023:SDP,
  author = "Zezhou Huang and Jiaxiang Liu and Daniel Gbenga Alabi and Raul Castro Fernandez and Eugene Wu",
  title = "{Saibot}: a Differentially Private Data Search Platform",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3057--3070",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611508",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611508",
  abstract = "Recent data search platforms use ML task-based utility measures rather than metadata-based keywords, to search large dataset corpora.
Requesters submit a training dataset, and these platforms search for augmentations---join or union-compatible datasets--- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2023:JGT,
  author = "Zezhou Huang and Rathijit Sen and Jiaxiang Liu and Eugene Wu",
  title = "{JoinBoost}: Grow Trees over Normalized Data Using Only {SQL}",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3071--3084",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611509",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Aug 25 07:25:43 MDT 2023",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL = "https://dl.acm.org/doi/10.14778/3611479.3611509",
  abstract = "Although dominant for tabular data, ML libraries that train tree models over normalized databases (e.g., LightGBM, XGBoost) require the data to be denormalized as a single table, materialized, and exported.
This process is not scalable, slow, and poses \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:RRR,
  author = "Qian Li and Peter Kraft and Michael Cafarella and {\c{C}}agatay Demiralp and Goetz Graefe and Christos Kozyrakis and Michael Stonebraker and Lalith Suresh and Xiangyao Yu and Matei Zaharia",
  title = "{R$^3$}: Record-Replay-Retroaction for Database-Backed Applications",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3085--3097",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611510",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Developers would benefit greatly from time travel: being able to faithfully replay past executions and retroactively execute modified code on past events. Currently, replay and retroaction are impractical because they require expensively capturing fine-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Sarkhel:2023:STL,
  author = "Ritesh Sarkhel and Binxuan Huang and Colin Lockard and Prashant Shiralkar",
  title = "Self-Training for Label-Efficient Information Extraction from Semi-Structured {Web}-Pages",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3098--3110",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611511",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Information Extraction (IE) from semi-structured web-pages is a long studied problem. Training a model for this extraction task requires a large number of human-labeled samples.
Prior works have proposed transferable models to improve the label-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Jin:2023:ENL,
  author = "Jiabao Jin and Peng Cheng and Lei Chen and Xuemin Lin and Wenjie Zhang",
  title = "Efficient Non-Learning Similar Subtrajectory Search",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3111--3123",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611512",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Similar subtrajectory search is a finer-grained operator that can better capture the similarities between one query trajectory and a portion of a data trajectory than the traditional similar trajectory search, which requires that the two checking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Cao:2023:FRA,
  author = "Xinle Cao and Jian Liu and Yongsheng Shen and Xiaohua Ye and Kui Ren",
  title = "Frequency-Revealing Attacks against Frequency-Hiding Order-Preserving Encryption",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3124--3136",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611513",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Order-preserving encryption (OPE) allows efficient comparison operations over encrypted data and thus is popular in encrypted databases. However, most existing OPE schemes are vulnerable to inference attacks as they leak plaintext frequency. To this end,
\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:EFT,
  author = "Tianyu Zhang and Kaige Liu and Jack Kosaian and Juncheng Yang and Rashmi Vinayak",
  title = "Efficient Fault Tolerance for Recommendation Model Training via Erasure Coding",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3137--3150",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611514",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Deep-learning-based recommendation models (DLRMs) are widely deployed to serve personalized content. In addition to using neural networks, DLRMs have large, sparsely-accessed embedding tables, which map categorical features to a learned dense \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Dong:2023:SWQ,
  author = "Rui Dong and Jie Liu and Yuxuan Zhu and Cong Yan and Barzan Mozafari and Xinyu Wang",
  title = "{SlabCity}: Whole-Query Optimization Using Program Synthesis",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3151--3164",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611515",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Query rewriting is often a prerequisite for effective query optimization, particularly for poorly-written queries. Prior work on query rewriting has relied on a set of ``rules'' based on syntactic pattern-matching. Whether relying on manual rules or auto-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Howie:2023:SSC,
  author = "Joseph Howie and Venkatesh Srinivasan and Alex Thomo",
  title = "Scaling Up Structural Clustering to Large Probabilistic Graphs Using {Lyapunov} Central Limit Theorem",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3165--3177",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611516",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Structural clustering is one of the most widely used graph clustering frameworks. In this paper, we focus on structural clustering of probabilistic graphs, which comes with significant computational challenges and has, so far, resisted efficient \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Rosenblatt:2023:EPR,
  author = "Lucas Rosenblatt and Bernease Herman and Anastasia Holovenko and Wonkwon Lee and Joshua Loftus and Elizabeth McKinnie and Taras Rumezhak and Andrii Stadnik and Bill Howe and Julia Stoyanovich",
  title = "Epistemic Parity: Reproducibility as an Evaluation Metric for Differential Privacy",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3178--3191",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611517",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Differential privacy (DP) data synthesizers are increasingly proposed to afford public release of sensitive information, offering theoretical guarantees for privacy (and, in some cases, utility), but limited empirical evidence of utility in practical \ldots{}",
acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Dadvar:2023:PPO,
  author = "Vargha Dadvar and Lukasz Golab and Divesh Srivastava",
  title = "{POEM}: Pattern-Oriented Explanations of Convolutional Neural Networks",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3192--3200",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611518",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Convolutional Neural Networks (CNNs) are commonly used in computer vision. However, their predictions are difficult to explain, as is the case with many deep learning models. To address this problem, we present POEM, a modular framework that produces \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2023:GEC,
  author = "Dandan Liu and Zhaonian Zou",
  title = "{gCore}: Exploring Cross-Layer Cohesiveness in Multi-Layer Graphs",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3201--3213",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611519",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "As multi-layer graphs can give a more accurate and reliable picture of the complex relationships between entities, cohesive subgraph mining, a fundamental task in graph analysis, has been studied on multi-layer graphs in the literature. However, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Tong:2023:SLD,
  author = "Yulai Tong and Jiazhen Liu and Hua Wang and Ke Zhou and Rongfeng He and Qin Zhang and Cheng Wang",
  title = "{Sieve}: a Learned Data-Skipping Index for Data Analytics",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3214--3226",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611520",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Modern data analytics services are coupled with external data storage services, making I/O from remote cloud storage one of the dominant costs for query processing. Techniques such as columnar block-based data organization and compression have become \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Tangwongsan:2023:OSW,
  author = "Kanat Tangwongsan and Martin Hirzel and Scott Schneider",
  title = "Out-of-Order Sliding-Window Aggregation with Efficient Bulk Evictions and Insertions",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3227--3239",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611521",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Sliding-window aggregation is a foundational stream processing primitive that efficiently summarizes recent data. The state-of-the-art algorithms for sliding-window aggregation are highly efficient when stream data items are evicted or inserted one at a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Wu:2023:BES,
  author = "Siyuan Wu and Leong Hou U and Panagiotis Karras",
  title = "{$k$}-Best Egalitarian Stable Marriages for Task Assignment",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3240--3252",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611522",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "In a two-sided market with each agent ranking individuals on the other side according to their preferences, such as location or incentive, the stable marriage problem calls to find a perfect matching among the two sides such that no pair of agents \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Cormode:2023:FCE,
  author = "Graham Cormode and Igor L. Markov",
  title = "Federated Calibration and Evaluation of Binary Classifiers",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3253--3265",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611523",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "We address two major obstacles to practical deployment of AI-based models on distributed private data. Whether a model was trained by a federation of cooperating clients or trained centrally, (1) the output scores must be calibrated, and (2) performance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Park:2023:FDF,
  author = "Jonghyeok Park and Soyee Choi and Gihwan Oh and Soojun Im and Moon-Wook Oh and Sang-Won Lee",
  title = "{FlashAlloc}: Dedicating Flash Blocks by Objects",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3266--3278",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611524",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "For a write request, today's flash storage cannot distinguish the logical object it comes from (e.g., SSTables in RocksDB). In such object-oblivious flash devices, concurrent writes from different objects are simply packed in their arrival order to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Shahbazi:2023:TFL,
  author = "Nima Shahbazi and Nikola Danevski and Fatemeh Nargesian and Abolfazl Asudeh and Divesh Srivastava",
  title = "Through the Fairness Lens: Experimental Analysis and Evaluation of Entity Matching",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3279--3292",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611525",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Entity matching (EM) is a challenging problem studied by different communities for over half a century. Algorithmic fairness has also become a timely topic to address machine bias and its societal impacts. Despite extensive research on these two topics, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Kraska:2023:CBB,
  author = "Tim Kraska and Tianyu Li and Samuel Madden and Markos Markakis and Amadou Ngom and Ziniu Wu and Geoffrey X. Yu",
  title = "Check Out the Big Brain on {BRAD}: Simplifying Cloud Data Processing with Learned Automated Data Meshes",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3293--3301",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611526",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "The last decade of database research has led to the prevalence of specialized systems for different workloads. Consequently, organizations often rely on a combination of specialized systems, organized in a Data Mesh. Data meshes present significant \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Fernandez:2023:HLL,
  author = "Raul Castro Fernandez and Aaron J. Elmore and Michael J. Franklin and Sanjay Krishnan and Chenhao Tan",
  title = "How Large Language Models Will Disrupt Data Management",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3302--3309",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611527",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "Large language models (LLMs), such as GPT-4, are revolutionizing software's ability to understand, process, and synthesize language. The authors of this paper believe that this advance in technology is significant enough to prompt introspection in the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Woltmann:2023:FML,
  author = "Lucas Woltmann and Jerome Thiessat and Claudio Hartmann and Dirk Habich and Wolfgang Lehner",
  title = "{FASTgres}: Making Learned Query Optimizer Hinting Effective",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3310--3322",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611528",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "The traditional and well-established cost-based query optimizer approach enumerates different execution plans for each query, assesses each plan with costs, and selects the plan that promises the lowest costs for execution. However, the optimal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc. VLDB Endowment",
  fjournal = "Proceedings of the VLDB Endowment",
  journal-URL = "https://dl.acm.org/loi/pvldb",
}

@Article{Vohringer:2023:WAT,
  author = "Demian V{\"o}hringer and Viktor Leis",
  title = "Write-Aware Timestamp Tracking: Effective and Efficient Page Replacement for Modern Hardware",
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = "16",
  number = "11",
  pages = "3323--3334",
  month = jul,
  year = "2023",
  CODEN = "????",
  DOI = "https://doi.org/10.14778/3611479.3611529",
  ISSN = "2150-8097",
  ISSN-L = "2150-8097",
  bibdate = "Fri Oct 3 07:33:13 MDT 2025",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract = "In this paper, we revisit the classical data management problem of page replacement. We propose Write-Aware Timestamp Tracking (WATT), a novel replacement algorithm that is optimized for modern hardware. By explicitly tracking the access history of each \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal = "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Butrovich:2023:TDP,
  author =       "Matthew Butrovich and Karthik Ramanathan and John
                 Rollinson and Wan Shen Lim and William Zhang and
                 Justine Sherry and Andrew Pavlo",
  title =        "{Tigger}: a Database Proxy That Bounces with
                 User-Bypass",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3335--3348",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611530",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Developers often deploy database-specific network
                 proxies whereby applications connect transparently to
                 the proxy instead of directly connecting to the
                 database management system (DBMS). This indirection
                 improves system performance through connection
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Xia:2023:EDV,
  author =       "Haocheng Xia and Jinfei Liu and Jian Lou and Zhan Qin
                 and Kui Ren and Yang Cao and Li Xiong",
  title =        "Equitable Data Valuation Meets the Right to Be
                 Forgotten in Model Markets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3349--3362",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611531",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The increasing demand for data-driven machine learning
                 (ML) models has led to the emergence of model markets,
                 where a broker collects personal data from data owners
                 to produce high-usability ML models. To incentivize
                 data owners to share their data, the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Khelifati:2023:TBB,
  author =       "Abdelouahab Khelifati and Mourad Khayati and Anton
                 Dign{\"o}s and Djellel Difallah and Philippe
                 Cudr{\'e}-Mauroux",
  title =        "{TSM-Bench}: Benchmarking Time Series Database Systems
                 for Monitoring Applications",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3363--3376",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611532",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Time series databases are essential for the
                 large-scale deployment of many critical industrial
                 applications. In infrastructure monitoring, for
                 instance, a database system should be able to process
                 large amounts of sensor data in real-time, execute
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Eltabakh:2023:CMD,
  author =       "Mohamed Y. Eltabakh and Mayuresh Kunjir and Ahmed K.
                 Elmagarmid and Mohammad Shahmeer Ahmad",
  title =        "Cross Modal Data Discovery over Structured and
                 Unstructured Data Lakes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3377--3390",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611533",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Organizations are collecting increasingly large
                 amounts of data for data-driven decision making. These
                 data are often dumped into a centralized repository,
                 e.g., a data lake, consisting of thousands of
                 structured and unstructured datasets. Perversely,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:ATS,
  author =       "Peng Li and Yeye He and Cong Yan and Yue Wang and
                 Surajit Chaudhuri",
  title =        "{Auto-Tables}: Synthesizing Multi-Step Transformations
                 to Relationalize Tables without Using Examples",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3391--3403",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611534",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Relational tables, where each row corresponds to an
                 entity and each column corresponds to an attribute,
                 have been the standard for tables in relational
                 databases. However, such a standard cannot be taken for
                 granted when dealing with tables ``in the wild''.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Buss:2023:EEA,
  author =       "Christopher Buss and Jasmin Mousavi and Mikhail
                 Tokarev and Arash Termehchy and David Maier and Stefan
                 Lee",
  title =        "Effective Entity Augmentation by Querying External
                 Data Sources",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3404--3417",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611535",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Users often want to augment and enrich entities in
                 their datasets with relevant information from external
                 data sources. As many external sources are accessible
                 only via keyword-search interfaces, a user usually has
                 to manually formulate a keyword query \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sylligardos:2023:CWE,
  author =       "Emmanouil Sylligardos and Paul Boniol and John
                 Paparrizos and Panos Trahanias and Themis Palpanas",
  title =        "Choose Wisely: an Extensive Evaluation of Model
                 Selection for Anomaly Detection in Time Series",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3418--3432",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611536",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Anomaly detection is a fundamental task for
                 time-series analytics with important implications for
                 the downstream performance of many applications.
                 Despite increasing academic interest and the large
                 number of methods proposed in the literature, recent
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Aguerrebere:2023:SSB,
  author =       "Cecilia Aguerrebere and Ishwar Singh Bhati and Mark
                 Hildebrand and Mariano Tepper and Theodore Willke",
  title =        "Similarity Search in the Blink of an Eye with
                 Compressed Indices",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3433--3446",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611537",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Nowadays, data is represented by vectors. Retrieving
                 those vectors, among millions and billions, that are
                 similar to a given query is a ubiquitous problem, known
                 as similarity search, of relevance for a wide range of
                 applications. Graph-based indices \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lv:2023:DAG,
  author =       "Ge Lv and Lei Chen",
  title =        "On Data-Aware Global Explainability of Graph Neural
                 Networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3447--3460",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611538",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Graph Neural Networks (GNNs) have significantly
                 boosted the performance of many graph-based
                 applications, yet they serve as black-box models. To
                 understand how GNNs make decisions, explainability
                 techniques have been extensively studied. While the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Jungmair:2023:DSO,
  author =       "Michael Jungmair and Jana Giceva",
  title =        "Declarative Sub-Operators for Universal Data
                 Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "11",
  pages =        "3461--3474",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611479.3611539",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:33:13 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data processing systems face the challenge of
                 supporting increasingly diverse workloads efficiently.
                 At the same time, they are already bloated with
                 internal complexity, and it is not clear how new
                 hardware can be supported sustainably. In this paper,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Tatemura:2023:PPP,
  author =       "Junichi Tatemura and Tao Zou and Jagan
                 Sankaranarayanan and Yanlai Huang and Jim Chen and Yupu
                 Zhang and Kevin Lai and Hao Zhang and Gokul Nath Babu
                 Manoharan and Goetz Graefe and Divyakant Agrawal and
                 Brad Adelberg and Shilpa Kolhar and Indrajit Roy",
  title =        "Progressive Partitioning for Parallelized Query
                 Execution in {Google}'s {Napa}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3475--3487",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611541",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611541",
  abstract =     "Napa holds Google's critical data warehouses in
                 log-structured merge trees for real-time data ingestion
                 and sub-second response for billions of queries per
                 day. These \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% In the next entry the name ``Per-{\AA}ke Larson'' carries its
%%% accent as a braced LaTeX escape, the same form used for the other
%%% accented author names in this file.

@Article{Depoutovitch:2023:TMB,
  author =       "Alex Depoutovitch and Chong Chen and Per-{\AA}ke
                 Larson and Jack Ng and Shu Lin and Guanzhu Xiong and
                 Paul Lee and Emad Boctor and Samiao Ren and Lengdong Wu
                 and Yuchen Zhang and Calvin Sun",
  title =        "{Taurus MM}: Bringing Multi-Master to the Cloud",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3488--3500",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611542",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611542",
  abstract =     "A single-master database has limited update capacity
                 because a single node handles all updates. A
                 multi-master database potentially has higher update
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Mao:2023:SCN,
  author =       "Yancan Mao and Zhanghao Chen and Yifan Zhang and Meng
                 Wang and Yong Fang and Guanghui Zhang and Rui Shi and
                 Richard T. B. Ma",
  title =        "{StreamOps}: Cloud-Native Runtime Management for
                 Streaming Services in {ByteDance}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3501--3514",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611543",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611543",
  abstract =     "Stream processing is widely used for real-time data
                 processing and decision-making, leading to tens of
                 thousands of streaming jobs deployed in ByteDance
                 cloud. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Anneser:2023:ALQ,
  author =       "Christoph Anneser and Nesime Tatbul and David Cohen
                 and Zhenggang Xu and Prithviraj Pandian and Nikolay
                 Laptev and Ryan Marcus",
  title =        "{AutoSteer}: Learned Query Optimization for Any {SQL}
                 Database",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3515--3527",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611544",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611544",
  abstract =     "This paper presents AutoSteer, a learning-based
                 solution that automatically drives query optimization
                 in any SQL database that exposes tunable optimizer
                 knobs. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:KRT,
  author =       "Jianjun Chen and Rui Shi and Heng Chen and Li Zhang
                 and Ruidong Li and Wei Ding and Liya Fan and Hao Wang
                 and Mu Xiong and Yuxiang Chen and Benchao Dong and
                 Kuankuan Guo and Yuanjin Lin and Xiao Liu and Haiyang
                 Shi and Peipei Wang and Zikang Wang and Yemeng Yang and
                 Junda Zhao and Dongyan Zhou and Zhikai Zuo and Yuming
                 Liang",
  title =        "{Krypton}: Real-Time Serving and Analytical {SQL}
                 Engine at {ByteDance}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3528--3542",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611545",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611545",
  abstract =     "In recent years, at ByteDance, we have started seeing
                 more and more business
scenarios that require performing real-time data serving besides
                 complex Ad \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zou:2023:EVE,
  author =       "Yuanhang Zou and Zhihao Ding and Jieming Shi and
                 Shuting Guo and Chunchen Su and Yafei Zhang",
  title =        "{EmbedX}: a Versatile, Efficient and Scalable Platform
                 to Embed Both Graphs and High-Dimensional Sparse Data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3543--3556",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611546",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611546",
  abstract =     "In modern online services, it is of growing importance
                 to process web-scale graph data and high-dimensional
                 sparse data together into embeddings for downstream
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Saxena:2023:SAG,
  author =       "Mohit Saxena and Benjamin Sowell and Daiyan Alamgir
                 and Nitin Bahadur and Bijay Bisht and Santosh
                 Chandrachood and Chitti Keswani and G. Krishnamoorthy
                 and Austin Lee and Bohou Li and Zach Mitchell and
                 Vaibhav Porwal and Maheedhar Reddy Chappidi and Brian
                 Ross and Noritaka Sekiyama and Omer Zaki and Linchi
                 Zhang and Mehul A.
Shah",
  title =        "The Story of {AWS Glue}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3557--3569",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611547",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611547",
  abstract =     "AWS Glue is Amazon's serverless data integration cloud
                 service that makes it simple and cost effective to
                 extract, clean, enrich, load, and organize data.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:TGE,
  author =       "Yang Li and Huaijun Jiang and Yu Shen and Yide Fang
                 and Xiaofeng Yang and Danqing Huang and Xinyi Zhang and
                 Wentao Zhang and Ce Zhang and Peng Chen and Bin Cui",
  title =        "Towards General and Efficient Online Tuning for
                 {Spark}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3570--3583",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611548",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611548",
  abstract =     "The distributed data analytic system --- Spark is a
                 common choice for processing massive volumes of
                 heterogeneous data, while it is challenging to tune its
                 parameters \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:CBP,
  author =       "Jiashu Zhang and Wen Jiang and Bo Tang and Haoxiang Ma
                 and Lixun Cao and Zhongbin Jiang and Yuanyuan Nie and
                 Fan Wang and Lei Zhang and Yuming Liang",
  title =        "{CDSBen}: Benchmarking the Performance of Storage
                 Services in Cloud-Native Database System at
                 {ByteDance}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3584--3596",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611549",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611549",
  abstract =     "In this work, we focus on the performance benchmarking
                 problem of storage services in cloud-native database
                 systems, which are widely used in various cloud
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhou:2023:FBR,
  author =       "Xuanhe Zhou and Cheng Chen and Kunyi Li and Bingsheng
                 He and Mian Lu and Qiaosheng Liu and Wei Huang and
                 Guoliang Li and Zhao Zheng and Yuqiang Chen",
  title =        "{FEBench}: a Benchmark for Real-Time Relational Data
                 Feature Extraction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3597--3609",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611550",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611550",
  abstract =     "As the use of online AI inference services rapidly
                 expands in various applications (e.g., fraud detection
                 in banking, product recommendation in e-commerce),
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Xiao:2023:MDF,
  author =       "Fei Xiao and Yuncheng Wu and Meihui Zhang and Gang
                 Chen and Beng Chin Ooi",
  title =        "{MINT}: Detecting Fraudulent Behaviors from
                 Time-Series Relational Data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3610--3623",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611551",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611551",
  abstract =     "The e-commerce platforms, such as Shopee, have
                 accumulated a huge volume of time-series relational
                 data, which contains useful information on
                 differentiating \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ahmad:2023:MPS,
  author =       "Shafi Ahmad and Dillidorai Arumugam and Srdan Bozovic
                 and Elnata Degefa and Sailesh Duvvuri and Steven Gott
                 and Nitish Gupta and Joachim Hammer and Nivedita
                 Kaluskar and Raghav Kaushik and Rakesh Khanduja and
                 Prasad Mujumdar and Gaurav Malhotra and Pankaj Naik and
                 Nikolas Ogg and Krishna Kumar Parthasarthy and Raghu
                 Ramakrishnan and Vlad Rodriguez and Rahul Sharma and
                 Jakub Szymaszek and Andreas Wolter",
  title =        "{Microsoft Purview}: a System for Central Governance
                 of Data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3624--3635",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611552",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611552",
  abstract =     "Modern data estates are spread across data located on
                 premises, on the edge and in one or more public clouds,
                 spread across various sources like multiple relational
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lin:2023:AAI,
  author =       "Liang Lin and Yuhan Li and Bin Wu and Huijun Mai and
                 Renjie Lou and Jian Tan and Feifei Li",
  title =        "{Anser}: Adaptive Information Sharing Framework of
                 {AnalyticDB}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3636--3648",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611553",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611553",
  abstract =     "The surge in data analytics has fostered burgeoning
                 demand for AnalyticDB on Alibaba Cloud, which has well
                 served thousands of customers from \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% In the next entry the middle initial of the third author is
%%% written with a period (``D.''), as for the other initials in this
%%% file.

@Article{Brucke:2023:TAI,
  author =       "Christoph Br{\"u}cke and Philipp H{\"a}rtling and
                 Rodrigo D. Escobar Palacios and Hamesh Patel and
                 Tilmann Rabl",
  title =        "{TPCx-AI} --- An Industry Standard Benchmark for
                 Artificial Intelligence and Machine Learning Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3649--3661",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611554",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611554",
  abstract =     "Artificial intelligence (AI) and machine learning (ML)
                 techniques have existed for years, but new hardware
                 trends and advances in model training and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Psallidas:2023:OEE,
  author =       "Fotis Psallidas and Ashvin Agrawal and Chandru Sugunan
                 and Khaled Ibrahim and Konstantinos Karanasos and
                 Jes{\'u}s Camacho-Rodr{\'{\i}}guez and Avrilia Floratou
                 and Carlo Curino and Raghu Ramakrishnan",
  title =        "{OneProvenance}: Efficient Extraction of Dynamic
                 Coarse-Grained Provenance from Database Query Event
                 Logs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3662--3675",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611555",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611555",
  abstract =     "Provenance encodes information that connects datasets,
                 their generation workflows, and associated metadata
                 (e.g., who or when executed a query). As \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Srinivasan:2023:TEB,
  author =       "V. Srinivasan and Andrew Gooding and Sunil Sayyaparaju
                 and Thomas Lopatic and Kevin Porter and Ashish Shinde
                 and B.
Narendran",
  title =        "Techniques and Efficiencies from Building a Real-Time
                 {DBMS}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3676--3688",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611556",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611556",
  abstract =     "This paper describes a variety of techniques from over
                 a decade of developing Aerospike (formerly Citrusleaf),
                 a real-time DBMS that is being used in some of the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:RTW,
  author =       "Jiaqi Wang and Tianyi Li and Anni Wang and Xiaoze Liu
                 and Lu Chen and Jie Chen and Jianye Liu and Junyang Wu
                 and Feifei Li and Yunjun Gao",
  title =        "Real-Time Workload Pattern Analysis for Large-Scale
                 Cloud Databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3689--3701",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611557",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611557",
  abstract =     "Hosting database services on cloud systems has become
                 a common practice. This has led to the increasing
                 volume of database workloads, which provides the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:BDA,
  author =       "Jiang Li and Qi Xie and Yan Ma and Jian Ma and
                 Kunshang Ji and Yizhong Zhang and Chaojun Zhang and
                 Yixiu Chen and Gangsheng Wu and Jie Zhang and Kaidi
                 Yang and Xinyi He and Qiuyang Shen and Yanting Tao and
                 Haiwei Zhao and Penghui Jiao and Chengfei Zhu and David
                 Qian and Cheng Xu",
  title =        "Big Data Analytic Toolkit: a General-Purpose, Modular,
                 and Heterogeneous Acceleration Toolkit for Data
                 Analytical Engines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3702--3714",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611558",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611558",
  abstract =     "Query compilation and hardware acceleration are
                 important technologies for optimizing the performance
                 of data processing engines. There have been many works
                 on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Shen:2023:LTC,
  author =       "Chunhui Shen and Qianyu Ouyang and Feibo Li and
                 Zhipeng Liu and Longcheng Zhu and Yujie Zou and Qing Su
                 and Tianhuan Yu and Yi Yi and Jianhong Hu and Cen Zheng
                 and Bo Wen and Hanbang Zheng and Lunfan Xu and Sicheng
                 Pan and Bin Wu and Xiao He and Ye Li and Jian Tan and
                 Sheng Wang and Dan Pei and Wei Zhang and Feifei Li",
  title =        "{Lindorm TSDB}: a Cloud-Native Time-Series Database
                 for Large-Scale Monitoring Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3715--3727",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611559",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611559",
  abstract =     "Internet services supported by large-scale distributed
                 systems have become essential for our daily life. To
                 ensure the stability and high quality of services,
                 diverse \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2023:OPH,
  author =       "Zhifeng Yang and Quanqing Xu and Shanyan Gao and
                 Chuanhui Yang and Guoping Wang and Yuzhong Zhao and
                 Fanyu Kong and Hao Liu and Wanhong Wang and Jinliang
                 Xiao",
  title =        "{OceanBase Paetica}: a Hybrid Shared-Nothing\slash
                 Shared-Everything Database for Supporting Single
                 Machine and Distributed Cluster",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3728--3740",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611560",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611560",
  abstract =     "In the ongoing evolution of the OceanBase database
                 system, it is essential to enhance its adaptability to
                 small-scale enterprises. The OceanBase database system
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% NOTE(review): the author list of the next entry used to name
%%% ``Hailiang Jie'' twice in a row; each author is listed once here.
%%% If the repeated name had displaced a different co-author, restore
%%% that name from the publisher record for the DOI below --- TODO
%%% confirm.

@Article{Yao:2023:SEU,
  author =       "Yuanyuan Yao and Dimeng Li and Hailiang Jie and Tianyi
                 Li and Jie Chen and Jiaqi Wang and Feifei Li and Yunjun
                 Gao",
  title =        "{SimpleTS}: an Efficient and Universal Model Selection
                 Framework for Time Series Forecasting",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3741--3753",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611561",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611561",
  abstract =     "Time series forecasting, that predicts events through
                 a sequence of time, has received increasing attention
                 in past decades.
The diverse range of time series \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2023:PSC,
  author =       "Xinjun Yang and Yingqiang Zhang and Hao Chen and Chuan
                 Sun and Feifei Li and Wenchao Zhou",
  title =        "{PolarDB-SCC}: a Cloud-Native Database Ensuring Low
                 Latency for Strongly Consistent Reads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3754--3767",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611562",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611562",
  abstract =     "A classic design of cloud-native databases adopts an
                 architecture that consists of one read/write (RW) node
                 and one or more read-only (RO) nodes. In such a design,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yamada:2023:SUT,
  author =       "Hiroyuki Yamada and Toshihiro Suzuki and Yuji Ito and
                 Jun Nemoto",
  title =        "{ScalarDB}: Universal Transaction Manager for
                 Polystores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3768--3780",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611563",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Mon Sep 18 10:22:20 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3611540.3611563",
  abstract =     "This paper presents ScalarDB, a universal transaction
                 manager that achieves distributed transactions across
                 multiple disparate databases. ScalarDB \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nie:2023:APS, author = "Xiaonan Nie and Yi Liu and Fangcheng Fu and Jinbao Xue and Dian Jiao and Xupeng Miao and Yangyu Tao and Bin Cui", title = "{Angel-PTM}: a Scalable and Economical Large-Scale Pre-Training System in {Tencent}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3781--3794", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611564", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611564", abstract = "Recent years have witnessed the unprecedented achievements of large-scale pre-trained models, especially Transformer models. Many products and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:EEE, author = "Ji You Li and Jiachi Zhang and Wenchao Zhou and Yuhang Liu and Shuai Zhang and Zhuoming Xue and Ding Xu and Hua Fan and Fangyuan Zhou and Feifei Li", title = "{Eigen}: End-to-End Resource Optimization for Large-Scale Databases on the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3795--3807", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611565", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611565", abstract = "Increasingly, cloud database vendors host large-scale geographically distributed clusters to provide cloud database services. When managing the clusters, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pan:2023:MUA, author = "Zhicheng Pan and Yihang Wang and Yingying Zhang and Sean Bin Yang and Yunyao Cheng and Peng Chen and Chenjuan Guo and Qingsong Wen and Xiduo Tian and Yunliang Dou and Zhiqiang Zhou and Chengcheng Yang and Aoying Zhou and Bin Yang", title = "{MagicScaler}: Uncertainty-Aware, Predictive Autoscaling", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3808--3821", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611566", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611566", abstract = "Predictive autoscaling is a key enabler for optimizing cloud resource allocation in Alibaba Cloud's computing platforms, which dynamically adjust the Elastic Compute Service \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Povzner:2023:KCN, author = "Anna Povzner and Prince Mahajan and Jason Gustafson and Jun Rao and Ismael Juma and Feng Min and Shriram Sridharan and Nikhil Bhatia and Gopi Attaluri and Adithya Chandra and Stanislav Kozlovski and Rajini Sivaram and Lucas Bradstreet and Bob Barrett and Dhruvil Shah and David Jacot and David Arthur and Ron Dagostino and Colin McCabe and Manikumar Reddy Obili and Kowshik Prakasam and Jose Garcia Sancio and Vikas Singh and Alok Nikhil and Kamal Gupta", title = "{Kora}: a Cloud-Native Event Streaming Platform for {Kafka}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3822--3834", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611567", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611567", abstract = "Event streaming is an increasingly critical infrastructure service used in many industries and there is growing demand for cloud-native solutions. Confluent \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pasupuleti:2023:ASE, author = "Krishna Kantikiran Pasupuleti and Jiakun Li and Hong Su and Mohamed Ziauddin", title = "Automatic {SQL} Error Mitigation in {Oracle}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3835--3847", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611568", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611568", abstract = "Despite best coding practices, software bugs are inevitable in a large codebase. In traditional databases, when errors occur during query processing, they \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:PFE, author = "Yanli Zhao and Andrew Gu and Rohan Varma and Liang Luo and Chien-Chin Huang and Min Xu and Less Wright and Hamid Shojanazeri and Myle Ott and Sam Shleifer and Alban Desmaison and Can Balioglu and Pritam Damania and Bernard Nguyen and Geeta Chauhan and Yuchen Hao and Ajit Mathews and Shen Li", title = "{PyTorch FSDP}: Experiences on Scaling Fully Sharded Data Parallel", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3848--3860", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611569", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Sep 18 10:22:20 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3611540.3611569", abstract = "It is widely acknowledged that large models have the potential to deliver superior performance across a broad range of domains. 
Despite the remarkable \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Keogh:2023:TSD, author = "Eamonn Keogh", title = "Time Series Data Mining: a Unifying View", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3861--3863", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611570", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series data are ubiquitous; large volumes of such data are routinely created in scientific, industrial, entertainment, medical and biological domains. Examples include ECG data, gait analysis, stock market quotes, machine health telemetry, search \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yow:2023:MLS, author = "Kai Siong Yow and Ningyi Liao and Siqiang Luo and Reynold Cheng", title = "Machine Learning for Subgraph Extraction: Methods, Applications and Challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3864--3867", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611571", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraphs are obtained by extracting a subset of vertices and a subset of edges from the associated original graphs, and many graph properties are known to be inherited by subgraphs. Subgraphs can be applied in many areas such as social networks, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmad:2023:PIR, author = "Ishtiyaque Ahmad and Divyakant Agrawal and Amr {El Abbadi} and Trinabh Gupta", title = "Private Information Retrieval in Large Scale Public Data Repositories", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3868--3871", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611572", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The tutorial focuses on Private Information Retrieval (PIR), which allows clients to privately query public or server-owned databases without disclosing their queries. The tutorial covers the basic concepts of PIR such as its types, construction, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pei:2023:DAM, author = "Jian Pei and Raul Castro Fernandez and Xiaohui Yu", title = "Data and {AI} Model Markets: Opportunities for Data and Model Sharing, Discovery, and Integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3872--3873", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611573", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The markets for data and AI models are rapidly emerging and increasingly significant in the realm and the practices of data science and artificial intelligence. These markets are being studied from diverse perspectives, such as e-commerce, economics, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Foufoulas:2023:EEU, author = "Yannis Foufoulas and Alkis Simitsis", title = "Efficient Execution of User-Defined Functions in {SQL} Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3874--3877", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611574", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "User-defined functions (UDFs) have been widely used to overcome the expressivity limitations of SQL and complement its declarative nature with functional capabilities. UDFs are particularly useful in today's applications that involve complex data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Katsogiannis-Meimarakis:2023:NLI, author = "George Katsogiannis-Meimarakis and Mike Xydas and Georgia Koutrika", title = "Natural Language Interfaces for Databases with Deep Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3878--3881", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611575", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the age of the Digital Revolution, almost all human activities, from industrial and business operations to medical and academic research, are reliant on the constant integration and utilisation of ever-increasing volumes of data. However, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2023:JOS, author = "Zhengtong Yan and Valter Uotila and Jiaheng Lu", title = "Join Order Selection with Deep Reinforcement Learning: Fundamentals, Techniques, and Challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3882--3885", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611576", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Join Order Selection (JOS) is a fundamental challenge in query optimization, as it significantly affects query performance. However, finding an optimal join order is an NP-hard problem due to the exponentially large search space. Despite the decades-long \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Manolescu:2023:FPG, author = "Ioana Manolescu and Madhulika Mohanty", title = "Full-Power Graph Querying: State of the Art and Challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3886--3889", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611577", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph databases are enjoying enormous popularity, through both their RDF and Property Graphs (PG) incarnations, in a variety of applications. To query graphs, query languages provide structured, as well as unstructured primitives. While structured \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gatterbauer:2023:TVR, author = "Wolfgang Gatterbauer", title = "A Tutorial on Visual Representations of Relational Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3890--3893", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611578", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query formulation is increasingly performed by systems that need to guess a user's intent (e.g. via spoken word interfaces). But how can a user know that the computational agent is returning answers to the ``right'' query? More generally, given that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lerner:2023:DMN, author = "Alberto Lerner and Carsten Binnig and Philippe Cudr{\'e}-Mauroux and Rana Hussein and Matthias Jasny and Theo Jepsen and Dan R. K. Ports and Lasse Thostrup and Tobias Ziegler", title = "Databases on Modern Networks: a Decade of Research That Now Comes into Practice", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3894--3897", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611579", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern cloud networks are a fundamental pillar of data-intensive applications. They provide high-speed transaction (packet) rates and low overhead, enabling, for instance, truly scalable database designs. These networks, however, are fundamentally \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:BCD, author = "Zuozhi Wang and Chen Li", title = "Building a Collaborative Data Analytics System: Opportunities and Challenges", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3898--3901", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611580", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-time collaboration has become increasingly important in various applications, from document creation to data analytics. Although collaboration features are prevalent in editing applications, they remain rare in data-analytics applications, where the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2023:PDO, author = "Wenjia He and Ibrahim Sabek and Yuze Lou and Michael Cafarella", title = "{PAINE Demo}: Optimizing Video Selection Queries with Commonsense Knowledge", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3902--3905", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611581", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Because video is becoming more popular and constitutes a major part of data collection, we have the need to process video selection queries --- selecting videos that contain target objects. However, a na{\"\i}ve scan of a video corpus without optimization \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2023:DDL, author = "Ziyang Xiao and Dongxiang Zhang and Zepeng Li and Sai Wu and Kian-Lee Tan and Gang Chen", title = "{DoveDB}: a Declarative and Low-Latency Video Database", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3906--3909", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611582", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Concerning the usability and efficiency to manage video data generated from large-scale cameras, we demonstrate DoveDB, a declarative and low-latency video database. We devise a more comprehensive video query language called VMQL to improve the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lew:2023:DDV, author = "Dong June Lew and Kihyun Yoo and Kwang Woo Nam", title = "{DeepVQL}: Deep Video Queries on {PostgreSQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3910--3913", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611583", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recent development of mobile and camera devices has led to the generation, sharing, and usage of massive amounts of video data. As a result, deep learning technology has gained attention as an alternative for video recognition and situation judgment. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2023:DDD, author = "Xiu Tang and Sai Wu and Dongxiang Zhang and Ziyue Wang and Gongsheng Yuan and Gang Chen", title = "A Demonstration of {DLBD}: Database Logic Bug Detection System", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3914--3917", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611584", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database management systems (DBMSs) are prone to logic bugs that can result in incorrect query results. Current debugging tools are limited to single table queries and struggle with issues like lack of ground-truth results and repetitive query space \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ni:2023:PBB, author = "Wangze Ni and Pengze Chen and Lei Chen", title = "{PSFQ}: a Blockchain-Based Privacy-Preserving and Verifiable Student Feedback Questionnaire Platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3918--3921", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611585", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, more and more higher education institutions have been using student feedback questionnaires (SFQ) to evaluate teaching. However, existing SFQ systems have two shortcomings. The first is that the respondent of an SFQ is not anonymous. The second \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Anneser:2023:QII, author = "Christoph Anneser and Mario Petruccelli and Nesime Tatbul and David Cohen and Zhenggang Xu and Prithviraj Pandian and Nikolay Laptev and Ryan Marcus and Alfons Kemper", title = "{QO-Insight}: Inspecting Steered Query Optimizers", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3922--3925", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611586", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Steered query optimizers address the planning mistakes of traditional query optimizers by providing them with hints on a per-query basis, thereby guiding them in the right direction. This paper introduces QO-Insight, a visual tool designed for exploring \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shen:2023:LGQ, author = "Zhihong Shen and Chuan Hu and Zihao Zhao", title = "{Lynx}: a Graph Query Framework for Multiple Heterogeneous Data Sources", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3926--3929", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611587", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph model are increasingly popular among modern applications for its ability to model complex relationships between entities. Users tend to query the data as a graph with graph operations (e.g., graph navigation and exploration). However, a large \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lepping:2023:SDM, author = "Aljoscha Lepping and Hoang Mi Pham and Laura Mons and Balint Rueb and Philipp M. Grulich and Ankit Chaudhary and Steffen Zeuch and Volker Markl", title = "Showcasing Data Management Challenges for Future {IoT} Applications with {NebulaStream}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3930--3933", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611588", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data management systems will face several new challenges in supporting IoT applications during the coming years. These challenges arise from managing large numbers of heterogeneous IoT devices and require combining elastic cloud and fog resources in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:CLC, author = "Zilong Wang and Qixiong Zeng and Ning Wang and Haowen Lu and Yue Zhang", title = "{CEDA}: Learned Cardinality Estimation with Domain Adaptation", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3934--3937", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611589", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality Estimation (CE) is a fundamental but critical problem in DBMS query optimization, while deep learning techniques have made significant breakthroughs in the research of CE. However, apart from requiring sufficiently large training data to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zalipynis:2023:FAN, author = "Ramon Antonio Rodriges Zalipynis", title = "{FastMosaic} in Action: a New Mosaic Operator for Array {DBMSs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3938--3941", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611590", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Array DBMSs operate on $N$-d arrays. During the Data Ingestion phase, the widely used mosaic operator ingests a massive collection of overlapping arrays into a single large array, called mosaic. The operator can utilize sophisticated statistical and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2023:SNM, author = "Zhuo Ma and Yilong Yang and Bin Xiao and Yang Liu and Xinjing Liu and Zhuoran Ma and Tong Yang", title = "{Sniffer}: a Novel Model Type Detection System against Machine-Learning-as-a-Service Platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3942--3945", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611591", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent works explore several attacks against Machine-Learning-as-a-Service (MLaaS) platforms (e.g., the model stealing attack), allegedly posing potential real-world threats beyond viability in laboratories. However, hampered by model-type-sensitive, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:KKG, author = "Xiang Wang and Xin Wang and Zhaozhuo Li and Dong Han", title = "{KGNav}: a Knowledge Graph Navigational Visual Query System", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3946--3949", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611592", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Visual query is a vital technique for comprehending and analyzing knowledge graphs, which provides an effective method to lower the barrier of querying knowledge graphs for non-professional users. Nevertheless, visual query techniques for knowledge \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mun:2023:FDT, author = "Ju Hyoung Mun and Konstantinos Karatsenidis and Tarikul Islam Papon and Shahin Roozkhosh and Denis Hoornaert and Ulrich Drepper and Ahmed Sanaullah and Renato Mancuso and Manos Athanassoulis", title = "On-the-Fly Data Transformation in Action", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3950--3953", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611593", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transactional and analytical database management systems (DBMS) typically employ different data layouts: row-stores for the first and column-stores for the latter. In order to bridge the requirements of the two without maintaining two systems and two \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Choi:2023:DWS, author = "Dalsu Choi and Hyunsik Yoon and Hyubjin Lee and Yon Dohn Chung", title = "Demonstrating Waffle: a Self-Driving Grid Index", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3954--3957", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611594", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper demonstrates Waffle, a self-driving grid indexing system for moving objects. We introduce system architecture, system workflow, and user scenarios. Waffle enables the management of moving objects with less human effort while automatically \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bylois:2023:CED, author = "Niels Bylois and Frank Neven and Stijn Vansummeren", title = "{CM-Explorer}: Dissecting Data Ingestion Problems", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3958--3961", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611595", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data ingestion validation, the task of certifying the quality of continuously collected data, is crucial to ensure trustworthiness of analytics insights. A widely used approach for validating data quality is to specify, either manually or automatically, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2023:EDP, author = "Tingyu Wang and Yuchao Tao and Amir Gilad and Ashwin Machanavajjhala and Sudeepa Roy", title = "Explaining Differentially Private Query Results with {DPXPlain}", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3962--3965", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611596", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Employing Differential Privacy (DP), the state-of-the-art privacy standard, to answer aggregate database queries poses new challenges for users to understand the trends and anomalies observed in the query results: Is the unexpected answer due to the data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2023:GAC, author = "Fei Xiao and Jiong Xie and Zhida Chen and Feifei Li and Zhen Chen and Jianwei Liu and Yinpei Liu", title = "{Ganos Aero}: a Cloud-Native System for Big Raster Data Management and Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "16", number = "12", pages = "3966--3969", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3611540.3611597", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:38:16 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The development of Earth Observation technology contributes to the production of massive raster data. It is vital to manage and conduct analytical tasks on the raster data. Existing solutions employ dedicated systems for the raster data management and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ghorbani:2023:DOF,
  author =       "Mahdi Ghorbani and Amir Shaikhha",
  title =        "Demonstration of {OpenDBML}, a Framework for Democratizing In-Database Machine Learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3970--3973",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611598",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Machine learning over relational data has been used in several applications. The traditional approach of joining relations first and then training a model on the joined table is time-consuming and requires a significant amount of memory. Recent research \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Abdallah:2023:DSM,
  author =       "Hussein Abdallah and Waleed Afandi and Essam Mansour",
  title =        "Demonstration of {SPARQL ML}: an Interfacing Language for Supporting Graph Machine Learning for {RDF} Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3974--3977",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611599",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This demo paper presents KGNet, a graph machine learning-enabled RDF engine. KGNet integrates graph machine learning (GML) models with existing RDF engines as query operators to support node classification and link prediction tasks. For easy integration, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:EVD,
  author =       "Enhao Zhang and Maureen Daum and Dong He and Manasi Ganti and Brandon Haynes and Ranjay Krishna and Magdalena Balazinska",
  title =        "{EQUI-VOCAL} Demonstration: Synthesizing Video Queries from User Interactions",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3978--3981",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611600",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We demonstrate EQUI-VOCAL, a system that synthesizes compositional queries over videos from user feedback. EQUI-VOCAL enables users to query a video database for complex events by providing a few positive and negative examples of what they are looking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Qiu:2023:TMT,
  author =       "Yuanhui Qiu and Chenguang Fang and Shaoxu Song and Xiangdong Huang and Chen Wang and Jianmin Wang",
  title =        "{TsQuality}: Measuring Time Series Data Quality in {Apache IoTDB}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3982--3985",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611601",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Time series has been found with various data quality issues, e.g., owing to sensor failure or network transmission errors in the Internet of Things (IoT). It is highly demanded to have an overview of the data quality issues on the millions of time series \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:AQC,
  author =       "Congying Wang and Nithin Sastry Tellapuri and Sphoorthi Keshannagari and Dylan Zinsley and Zhuoyue Zhao and Dong Xie",
  title =        "Approximate Queries over Concurrent Updates",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3986--3989",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611602",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Approximate Query Processing (AQP) systems produce estimation of query answers using small random samples. It is attractive for the users who are willing to trade accuracy for low query latency. On the other hand, real-world data are often subject to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Fritsch:2023:SHV,
  author =       "Kristin Fritsch and Stefanie Scherzinger",
  title =        "Solving Hard Variants of Database Schema Matching on Quantum Computers",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3990--3993",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611603",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "With quantum computers now available as cloud services, there is a global quest for applications where a quantum advantage can be shown. Naturally, data management is a candidate domain. Workable solutions require the design of hybrid quantum algorithms, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bonifati:2023:ICM,
  author =       "Angela Bonifati and Francesco {Del Buono} and Francesco Guerra and Miki Lombardi and Donato Tiano",
  title =        "Interpretable Clustering of Multivariate Time Series with {Time2Feat}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3994--3997",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611604",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This paper showcases Time2Feat, an end-to-end machine learning system for Multivariate Time Series (MTS) clustering. The system relies on interpretable inter-signal and intra-signal features extracted from the time series. Then, a dimensionality \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:DQE,
  author =       "Chaozu Zhang and Qiaomu Shen and Bo Tang",
  title =        "{DHive}: Query Execution Performance Analysis via Dataflow in {Apache Hive}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "3998--4001",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611605",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Nowadays, Apache Hive has been widely used for large-scale data analysis applications in many organizations. Various visual analytical tools are developed to help Hive users quickly analyze the query execution process and identify the performance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Grafberger:2023:MWI,
  author =       "Stefan Grafberger and Shubha Guha and Paul Groth and Sebastian Schelter",
  title =        "{mlwhatif}: What If You Could Stop Re-Implementing Your Machine Learning Pipeline Analyses over and over?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4002--4005",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611606",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Software systems that learn from data with machine learning (ML) are used in critical decision-making processes. Unfortunately, real-world experience shows that the pipelines for data preparation, feature encoding and model training in ML systems are \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Peng:2023:AAC,
  author =       "Yuchen Peng and Ke Chen and Lidan Shou and Dawei Jiang and Gang Chen",
  title =        "{AQUA}: Automatic Collaborative Query Processing in Analytical Database",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4006--4009",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611607",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data analysts nowadays are keen to have analytical capabilities involving deep learning (DL). Collaborative queries, which employ relational operations to process structured data and DL models to process unstructured data, provide a powerful facility for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2023:VBG,
  author =       "Kai Huang and Houdong Liang and Chongchong Yao and Xi Zhao and Yue Cui and Yao Tian and Ruiyuan Zhang and Xiaofang Zhou",
  title =        "{VisualNeo}: Bridging the Gap between Visual Query Interfaces and Graph Query Engines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4010--4013",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611608",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Visual Graph Query Interfaces (VQIs) empower non-programmers to query graph data by constructing visual queries intuitively. Devising efficient technologies in Graph Query Engines (GQEs) for interactive search and exploration has also been studied for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bellomarini:2023:KRI,
  author =       "Luigi Bellomarini and Marco Benedetti and Andrea Gentili and Davide Magnanimi and Emanuel Sallinger",
  title =        "{KG-Roar}: Interactive {Datalog}-Based Reasoning on Virtual Knowledge Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4014--4017",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611609",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Logic-based Knowledge Graphs (KGs) are gaining momentum in academia and industry thanks to the rise of expressive and efficient languages for Knowledge Representation and Reasoning (KRR).
These languages accurately express business rules, through which \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Schafer:2023:UBD,
  author =       "Nico Sch{\"a}fer and Damjan Gjurovski and Angjela Davitkova and Sebastian Michel",
  title =        "To {UDFs} and Beyond: Demonstration of a Fully Decomposed Data Processor for General Data Wrangling Tasks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4018--4021",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611610",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "While existing data management solutions try to keep up with novel data formats and features, a myriad of valuable functionality is often only accessible via programming language libraries. Particularly for machine learning tasks, there is a wealth of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2023:CAH,
  author =       "Yushi Liu and Liwei Yuan and Zhihao Chen and Yekai Yu and Zhao Zhang and Cheqing Jin and Ying Yan",
  title =        "{ChainDash}: an Ad-Hoc Blockchain Data Analytics System",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4022--4025",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611611",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The emergence of digital asset applications, driven by Web 3.0 and powered by blockchain technology, has led to a growing demand for blockchain-specific graph analytics to unearth the insights.
However, current blockchain data analytics systems are \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zecchini:2023:BER,
  author =       "Luca Zecchini and Giovanni Simonini and Sonia Bergamaschi and Felix Naumann",
  title =        "{BrewER}: Entity Resolution On-Demand",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4026--4029",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611612",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The task of entity resolution (ER) aims to detect multiple records describing the same real-world entity in datasets and to consolidate them into a single consistent record. ER plays a fundamental role in guaranteeing good data quality, e.g., as input \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:VSF,
  author =       "Fanchao Chen and Dixin Tang and Haotian Li and Aditya G. Parameswaran",
  title =        "Visualizing Spreadsheet Formula Graphs Compactly",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4030--4033",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611613",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Spreadsheets are a ubiquitous data analysis tool, empowering non-programmers and programmers alike to easily express their computations by writing formulae alongside data. The dependencies created by formulae are tracked as formula graphs, which play a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{tenWolde:2023:DBS,
  author =       "Daniel ten Wolde and G{\'a}bor Sz{\'a}rnyas and Peter Boncz",
  title =        "{DuckPGQ}: Bringing {SQL\slash PGQ} to {DuckDB}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4034--4037",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611614",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We demonstrate the most important new feature of SQL:2023, namely SQL/PGQ, which eases querying graphs using SQL by introducing new syntax for pattern matching and (shortest) path-finding. We show how support for SQL/PGQ can be integrated into an RDBMS, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bai:2023:DQS,
  author =       "Qiushi Bai and Sadeem Alsudais and Chen Li",
  title =        "Demo of {QueryBooster}: Supporting Middleware-Based {SQL} Query Rewriting as a Service",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4038--4041",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611615",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Query rewriting is an important technique to optimize SQL performance in databases. With the prevalent use of business intelligence systems and object-relational mapping frameworks, existing rewriting capabilities inside databases are insufficient to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wu:2023:WCU,
  author =       "Weiyuan Wu and Pei Wang and Yi Xie and Yejia Liu and George Chow and Jiannan Wang",
  title =        "{Web Connector}: a Unified {API} Wrapper to Simplify {Web} Data Collection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4042--4045",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611616",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Collecting structured data from Web APIs, such as the Twitter API, Yelp Fusion API, Spotify API, and DBLP API, is a common task in the data science lifecycle, but it requires advanced programming skills for data scientists. To simplify web data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gao:2023:FRR,
  author =       "Dawei Gao and Daoyuan Chen and Zitao Li and Yuexiang Xie and Xuchen Pan and Yaliang Li and Bolin Ding and Jingren Zhou",
  title =        "{FS-Real}: a Real-World Cross-Device Federated Learning Platform",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4046--4049",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611617",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Federated learning (FL) is a general distributed machine learning paradigm that provides solutions for tasks where data cannot be shared directly. Due to the difficulties in communication management and heterogeneity of distributed data and devices, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Song:2023:AAD,
  author =       "Xintong Song and Yusen Zhu and Jianfei Wu and Bai Liu and Hongkang Wei",
  title =        "{ADOps}: an Anomaly Detection Pipeline in Structured Logs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4050--4053",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611618",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Anomaly detection has been extensively implemented in industry. The reality is that an application may have numerous scenarios where anomalies need to be monitored. However, the complete process of anomaly detection will take much time, including data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Spenger:2023:PSM,
  author =       "Jonas Spenger and Chengyang Huang and Philipp Haller and Paris Carbone",
  title =        "{Portals}: a Showcase of Multi-Dataflow Stateful Serverless",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4054--4057",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611619",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Serverless applications spanning the cloud and edge require flexible programming frameworks for expressing compositions across the different levels of deployment. Another critical aspect for applications with state is failure resilience beyond the scope \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Singh:2023:CLS,
  author =       "Mukul Singh and Jos{\'e} Cambronero Sanchez and Sumit Gulwani and Vu Le and Carina Negreanu and Gust Verbruggen",
  title =        "{Cornet}: Learning Spreadsheet Formatting Rules by Example",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4058--4061",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611620",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data management and analysis tasks are often carried out using spreadsheet software. A popular feature in most spreadsheet platforms is the ability to define data-dependent formatting rules. These rules can express actions such as ``color red all entries \ldots{}''",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Qian:2023:FIS,
  author =       "Chen Qian and Shiwei Liang and Zhaoyang Wang and Yin Lou",
  title =        "{Fanglue}: an Interactive System for Decision Rule Crafting",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4062--4065",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611621",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In many applications the training data do not always contain sufficient information to produce high-quality decision rules for standard (end-to-end) rule mining algorithms, and human experts have to incorporate domain knowledge during rule induction in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Paparrizos:2023:OEE,
  author =       "John Paparrizos and Sai Prasanna Teja Reddy",
  title =        "{Odyssey}: an Engine Enabling the Time-Series Clustering Journey",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4066--4069",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611622",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Clustering is one of the most popular time-series tasks because it enables unsupervised data exploration and often serves as a subroutine or preprocessing step for other tasks. Despite being the subject of active research across disciplines for decades, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:EQR,
  author =       "Jinyang Li and Alon Silberstein and Yuval Moskovitch and Julia Stoyanovich and H. V. Jagadish",
  title =        "{Erica}: Query Refinement for Diversity Constraint Satisfaction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4070--4073",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611623",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Relational queries are commonly used to support decision making in critical domains like hiring and college admissions. For example, a college admissions officer may need to select a subset of the applicants for in-person interviews, who individually \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:LMG,
  author =       "Zui Chen and Lei Cao and Sam Madden",
  title =        "{Lingua Manga}: a Generic Large Language Model Centric System for Data Curation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4074--4077",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611624",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data curation is a wide-ranging area which contains many critical but time-consuming data processing tasks. However, the diversity of such tasks makes it challenging to develop a general-purpose data curation system. To address this issue, we present \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gavriilidis:2023:XAD,
  author =       "Haralampos Gavriilidis and Leonhard Rose and Joel Ziegler and Kaustubh Beedkar and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker Markl",
  title =        "{XDB} in Action: Decentralized Cross-Database Query Processing for Black-Box {DBMSes}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4078--4081",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611625",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data are naturally produced at different locations and hence stored on different DBMSes. To maximize the value of the collected data, today's users combine data from different sources. Research in data integration has proposed the Mediator-Wrapper (MW) \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kakkar:2023:IDE,
  author =       "Gaurav Tarlok Kakkar and Aryan Rajoria and Myna Prasanna Kalluraya and Ashmita Raju and Jiashen Cao and Kexin Rong and Joy Arulraj",
  title =        "Interactive Demonstration of {EVA}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4082--4085",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611626",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "In this demonstration, we will present EVA, an end-to-end AI-Relational database management system. We will demonstrate the capabilities and utility of EVA using three usage scenarios: (1) EVA serves as a backend for an exploratory video analytics \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:RSO,
  author =       "Xiling Li and Gefei Tan and Xiao Wang and Jennie Rogers and Soamar Homsi",
  title =        "{RESCU-SQL}: Oblivious Querying for the Zero Trust Cloud",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4086--4089",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611627",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Cloud service providers offer robust infrastructure for rent to organizations of all kinds. High stakes applications, such as the ones in defense and healthcare, are turning to the public cloud for a cost-effective, geographically distributed, always \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Abdelaziz:2023:DST,
  author =       "Ibrahim Abdelaziz and Julian Dolby and Udayan Khurana and Horst Samulowitz and Kavitha Srinivas",
  title =        "{DataRinse}: Semantic Transforms for Data Preparation Based on Code Mining",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4090--4093",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611628",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data preparation is a crucial first step to any data analysis problem. This task is largely manual, performed by a person familiar with the data domain. DataRinse is a system designed to extract relevant transforms from large scale static analysis of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:DAA,
  author =       "Junxiong Wang and Mitchell Gray and Immanuel Trummer and Ahmet Kara and Dan Olteanu",
  title =        "Demonstrating {ADOPT}: Adaptively Optimizing Attribute Orders for Worst-Case Optimal Joins via Reinforcement Learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4094--4097",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611629",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Performance of worst-case optimal join algorithms depends on the order in which the join attributes are processed. It is challenging to identify suitable orders prior to query execution due to the huge search space of possible orders and unreliable \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Trummer:2023:DGD,
  author =       "Immanuel Trummer",
  title =        "Demonstrating {GPT-DB}: Generating Query-Specific and Customizable Code for {SQL} Processing with {GPT-4}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4098--4101",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611630",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "GPT-DB generates code for SQL processing in general-purpose programming languages such as Python. Generated code can be freely customized using user-provided natural language instructions. This enables users, for instance, to try out specific libraries \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{deAlmeida:2023:SVA,
  author =       "Vicente Nejar de Almeida and Eduardo Ribeiro and Nassim Bouarour and Jo{\~a}o Luiz Dihl Comba and Sihem Amer-Yahia",
  title =        "{SHEVA}: a Visual Analytics System for Statistical Hypothesis Exploration",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4102--4105",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611631",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "We demonstrate SHEVA, a System for Hypothesis Exploration with Visual Analytics. SHEVA adopts an Exploratory Data Analysis (EDA) approach to discovering statistically-sound insights from large datasets. The system addresses three longstanding challenges \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Qiao:2023:PGI,
  author =       "Shi Qiao and Alekh Jindal",
  title =        "{PikePlace}: Generating Intelligence for Marketplace Datasets",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4106--4109",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611632",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "There is a renewed interest in data marketplaces with cloud data warehouses that make sharing and accessing data on-demand and extremely easy. However, analyzing marketplace datasets is challenge since current tools for creating the data models are \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhou:2023:LQR,
  author =       "Xuanhe Zhou and Guoliang Li and Jianming Wu and Jiesi Liu and Zhaoyan Sun and Xinning Zhang",
  title =        "A Learned Query Rewrite System",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4110--4113",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611633",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Query rewriting is a challenging task that transforms a SQL query to improve its performance while maintaining its result set. However, it is difficult to rewrite SQL queries, which often involve complex logical structures, and there are numerous \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Halevy:2023:WLR,
  author =       "Alon Halevy and Yejin Choi and Avrilia Floratou and Michael J. Franklin and Natasha Noy and Haixun Wang",
  title =        "Will {LLMs} Reshape, Supercharge, or Kill Data Science? ({VLDB 2023} Panel)",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4114--4115",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611634",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Large language models (LLMs) have recently taken the world by storm, promising potentially game changing opportunities in multiple fields. Naturally, there is significant promise in applying LLMs to the management of structured data, or more generally, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Cheung:2023:TAG,
  author =       "Alvin Cheung and Maaz {Bin Safeer Ahmad} and Brandon Haynes and Chanwut Kittivorawong and Shadaj Laddad and Xiaoxuan Liu and Chenglong Wang and Cong Yan",
  title =        "Towards Auto-Generated Data Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4116--4129",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611635",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "After decades of progress, database management systems (DBMSs) are now the backbones of many data applications that we interact with on a daily basis. Yet, with the emergence of new data types and hardware, building and optimizing new data systems remain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Dong:2023:GKG,
  author =       "Xin Luna Dong",
  title =        "Generations of Knowledge Graphs: The Crazy Ideas and the Business Impact",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4130--4137",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611636",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Knowledge Graphs (KGs) have been used to support a wide range of applications, from web search to personal assistant. In this paper, we describe three generations of knowledge graphs: entity-based KGs, which have been supporting general search and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gonzalez:2023:SGS,
  author =       "Joseph E. Gonzalez and Yucheng Low",
  title =        "The Story of {GraphLab} --- From Scaling Machine Learning to Shaping Graph Systems Research ({VLDB 2023 Test-of-Time Award} Talk)",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4138--4138",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611637",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The GraphLab project spanned almost a decade and had profound academic and industrial impact on large-scale machine learning and graph processing systems. There were numerous papers written describing the innovations in GraphLab including the original \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Choi:2023:CSD,
  author =       "Yejin Choi",
  title =        "Common Sense: The Dark Matter of Language and Intelligence ({VLDB 2023} Keynote)",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4139--4139",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611638",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Scale appears to be the winning recipe in today's leaderboards. And yet, extreme-scale neural models are (un)surprisingly brittle and make errors that are often nonsensical and even counterintuitive. In this talk, I will argue for the importance of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:MDC,
  author =       "Feifei Li",
  title =        "Modernization of Databases in the Cloud Era: Building Databases that Run Like {Legos}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4140--4151",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611639",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Utilizing cloud for common and critical computing infrastructures has already become the norm across the board. The rapid evolvement of the underlying cloud infrastructure and the revolutionary development of AI present both challenges and opportunities \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Milic-Frayling:2023:CCT,
  author =       "Natasa Milic-Frayling",
  title =        "On the Cusp: Computing Thrills and Perils and Professional Awakening",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "12",
  pages =        "4152--4159",
  month =        aug,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3611540.3611640",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 07:38:16 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Over the past eight decades, computer science has advanced as a field, and the computing profession has matured by establishing professional codes of conduct, fostering best practices, and establishing industry standards to support the proliferation of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Qiu:2023:DDO,
  author =       "Lina Qiu and Georgios Kellaris and Nikos Mamoulis and Kobbi Nissim and George Kollios",
  title =        "{Doquet}: Differentially Oblivious Range and Join Queries with Private Data Structures",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4160--4173",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625055",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625055",
  abstract =     "Most cloud service providers offer limited data privacy guarantees, discouraging clients from using them for managing their sensitive data. Cloud providers may use servers with Trusted Execution Environments (TEEs) to protect outsourced data, while \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chiosa:2023:AAC,
  author =       "Monica Chiosa and Thomas B. Preu{\ss}er and Michaela Blott and Gustavo Alonso",
  title =        "{AMNES}: Accelerating the Computation of Data Correlation Using {FPGAs}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4174--4187",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625056",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625056",
  abstract =     "A widely used approach to characterize input data in both databases and ML is computing the correlation between attributes. The operation is supported by all major database engines and ML platforms. However, it is an expensive operation as the number of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Daum:2023:VPY,
  author =       "Maureen Daum and Enhao Zhang and Dong He and Stephen Mussmann and Brandon Haynes and Ranjay Krishna and Magdalena Balazinska",
  title =        "{VOCALExplore}: Pay-as-You-Go Video Data Exploration and Model Building",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4188--4201",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625057",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625057",
  abstract =     "We introduce VOCALExplore, a system designed to support users in building domain-specific models over video datasets. VOCALExplore supports interactive labeling sessions and trains models using user-supplied labels.
VOCALExplore maximizes model quality \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Arora:2023:FRA,
  author =       "Pankaj Arora and Surajit Chaudhuri and Sudipto Das and Junfeng Dong and Cyril George and Ajay Kalhan and Arnd Christian K{\"o}nig and Willis Lang and Changsong Li and Feng Li and Jiaqi Liu and Lukas M. Maas and Akshay Mata and Ishai Menache and Justin Moeller and Vivek Narasayya and Matthaios Olma and Morgan Oslake and Elnaz Rezai and Yi Shan and Manoj Syamala and Shize Xu and Vasileios Zois",
  title =        "Flexible Resource Allocation for Relational Database-as-a-Service",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4202--4215",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625058",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625058",
  abstract =     "Oversubscription is an essential cost management strategy for cloud database providers, and its importance is magnified by the emerging paradigm of serverless databases. In contrast to general purpose techniques used for oversubscription in hypervisors, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Gu:2023:SEA,
  author =       "Rong Gu and Han Li and Haipeng Dai and Wenjie Huang and Jie Xue and Meng Li and Jiaqi Zheng and Haoran Cai and Yihua Huang and Guihai Chen",
  title =        "{ShadowAQP}: Efficient Approximate Group-by and Join Query via Attribute-Oriented Sample Size Allocation and Data Generation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4216--4229",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625059",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625059",
  abstract =     "Approximate query processing (AQP) is one of the key techniques to cope with big data querying problem on account that it obtains approximate answers efficiently. To address non-trivial sample selection and heavy sampling cost issues in AQP, we propose \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liu:2023:ODP,
  author =       "Rui Liu and Kwanghyun Park and Fotis Psallidas and Xiaoyong Zhu and Jinghui Mo and Rathijit Sen and Matteo Interlandi and Konstantinos Karanasos and Yuanyuan Tian and Jes{\'u}s Camacho-Rodr{\'\i}guez",
  title =        "Optimizing Data Pipelines for Machine Learning in Feature Stores",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4230--4239",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625060",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625060",
  abstract =     "Data pipelines (i.e., converting raw data to features) are critical for machine learning (ML) models, yet their development and management is time-consuming. Feature stores have recently emerged as a new ``DBMS-for-ML'' with the premise of enabling data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Angles:2023:SSE,
  author =       "Renzo Angles and Georg Gottlob and Aleksandar Pavlovi{\'c} and Reinhard Pichler and Emanuel Sallinger",
  title =        "{SparqLog}: a System for Efficient Evaluation of {SPARQL 1.1} Queries via {Datalog}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4240--4253",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625061",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625061",
  abstract =     "Over the past decade, Knowledge Graphs have received enormous interest both from industry and from academia.
Research in this area has been driven, above all, by the Database (DB) community and the Semantic Web (SW) community. However, there still \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Konig:2023:SLC,
  author =       "Arnd Christian K{\"o}nig and Yi Shan and Karan Newatia and Luke Marshall and Vivek Narasayya",
  title =        "Solver-In-The-Loop Cluster Resource Management for Database-as-a-Service",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4254--4267",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625062",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625062",
  abstract =     "In Database-as-a-Service (DBaaS) clusters, resource management is a complex optimization problem that assigns tenants to nodes, subject to various constraints and objectives. Tenants share resources within a node, however, their resource demands can \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Henneberg:2023:REH,
  author =       "Justus Henneberg and Felix Schuhknecht",
  title =        "{RTIndeX}: Exploiting Hardware-Accelerated {GPU} Raytracing for Database Indexing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4268--4281",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625063",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625063",
  abstract =     "Data management on GPUs has become increasingly relevant due to a tremendous rise in processing power and available GPU memory. Similar to main-memory systems, there is a need for performant GPU-resident index structures to speed up query processing. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lian:2023:CCT,
  author =       "Jinqing Lian and Xinyi Zhang and Yingxia Shao and Zenglin Pu and Qingfeng Xiang and Yawen Li and Bin Cui",
  title =        "{ContTune}: Continuous Tuning by Conservative {Bayesian} Optimization for Distributed Stream Data Processing Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4282--4295",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625064",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625064",
  abstract =     "The past decade has seen rapid growth of distributed stream data processing systems.
Under these systems, a stream application is realized as a Directed Acyclic Graph (DAG) of operators, where the level of parallelism of each operator has a substantial \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Melissourgos:2023:SUS,
  author =       "Dimitrios Melissourgos and Haibo Wang and Shigang Chen and Chaoyi Ma and Shiping Chen",
  title =        "Single Update Sketch with Variable Counter Structure",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4296--4309",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625065",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625065",
  abstract =     "Per-flow size measurement is key to many streaming applications and management systems, particularly in high-speed networks. Performing such measurement on the data plane of a network device at the line rate requires on-chip memory and computing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Trummer:2023:CLL,
  author =       "Immanuel Trummer",
  title =        "Can Large Language Models Predict Data Correlations from Column Names?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4310--4323",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625066",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625066",
  abstract =     "Recent publications suggest using natural language analysis on database schema elements to guide tuning and profiling efforts. The underlying hypothesis is that state-of-the-art language processing methods, so-called language models, are able to extract \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chamani:2023:GTO,
  author =       "Javad Ghareh Chamani and Ioannis Demertzis and Dimitrios Papadopoulos and Charalampos Papamanthou and Rasool Jalili",
  title =        "{GraphOS}: Towards Oblivious Graph Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4324--4338",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625067",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625067",
  abstract =     "We propose GraphOS, a system that allows a client that owns a graph database to outsource it to an untrusted server for storage and querying. It relies on doubly-oblivious primitives and trusted hardware to achieve a very strong privacy and efficiency \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:COC,
  author =       "Kefei Wang and Feng Chen",
  title =        "{Catalyst}: Optimizing Cache Management for Large In-memory Key-value Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "16",
  number =       "13",
  pages =        "4339--4352",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3625054.3625068",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:04 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3625054.3625068",
  abstract =     "In-memory key-value cache systems, such as Memcached and Redis, are essential in today's data centers. A key mission of such cache systems is to identify the most valuable data for caching. To achieve this, the current system design keeps track of each \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zheng:2023:DDL,
  author =       "Bolong Zheng and Yongyong Gao and Jingyi Wan and Lingsen Yan and Long Hu and Bo Liu and Yunjun Gao and Xiaofang Zhou and Christian S. Jensen",
  title =        "{DecLog}: Decentralized Logging in Non-Volatile Memory for Time Series Database Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "1--14",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617839",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617839",
  abstract =     "Growing demands for the efficient processing of extreme-scale time series workloads call for more capable time series database management systems (TSDBMS).
Specifically, to maintain consistency and durability of transaction processing, systems employ \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:EDW,
  author =       "Fangyuan Zhang and Mengxu Jiang and Sibo Wang",
  title =        "Efficient Dynamic Weighted Set Sampling and Its Extension",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "15--27",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617840",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617840",
  abstract =     "Given a weighted set S of n elements, weighted set sampling (WSS) samples an element in S so that each element $a_i$ is sampled with a probability proportional to its weight $w(a_i)$. The classic alias method pre-processes an index in $O(n)$ time with $O(n)$ \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Lin:2023:ZLI,
  author =       "Yiming Lin and Sharad Mehrotra",
  title =        "{ZIP}: Lazy Imputation during Query Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "28--40",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617841",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617841",
  abstract =     "This paper develops a query-time missing value imputation framework, entitled ZIP, that modifies relational operators to be imputation aware in order to minimize the joint cost of imputing and query processing.
The modified operators use a cost-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:FTA,
  author =       "Xunkai Li and Zhengyu Wu and Wentao Zhang and Yinlin Zhu and Rong-Hua Li and Guoren Wang",
  title =        "{FedGTA}: Topology-Aware Averaging for Federated Graph Learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "41--50",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617842",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617842",
  abstract =     "Federated Graph Learning (FGL) is a distributed machine learning paradigm that enables collaborative training on large-scale subgraphs across multiple local systems. Existing FGL studies fall into two categories: (i) FGL Optimization, which improves \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chang:2023:HPM,
  author =       "Xueqin Chang and Xiangyu Ke and Lu Chen and Congcong Ge and Ziheng Wei and Yunjun Gao",
  title =        "Host Profit Maximization: Leveraging Performance Incentives and User Flexibility",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "51--64",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617843",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617843",
  abstract =     "The social network host has knowledge of the network structure and user characteristics and can earn a profit by providing merchants with viral marketing campaigns.
We investigate the problem of host profit maximization by leveraging performance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Patwa:2023:DPP,
  author =       "Shweta Patwa and Danyu Sun and Amir Gilad and Ashwin Machanavajjhala and Sudeepa Roy",
  title =        "{DP-PQD}: Privately Detecting Per-Query Gaps in Synthetic Data Generated by Black-Box Mechanisms",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "1",
  pages =        "65--78",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3617838.3617844",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 5 08:24:06 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3617838.3617844",
  abstract =     "Synthetic data generation methods, and in particular, private synthetic data generation methods, are gaining popularity as a means to make copies of sensitive databases that can be shared widely for research and data analysis. Some of the fundamental \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wei:2023:CSP,
  author =       "Ruidi Wei and Florian Kerschbaum",
  title =        "Cryptographically Secure Private Record Linkage using Locality-Sensitive Hashing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "79--91",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626293",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626293",
  abstract =     "Private record linkage (PRL) is the problem of identifying pairs of records that approximately match across datasets in a secure, privacy-preserving manner. Two-party PRL specifically allows each of the parties to obtain records from the other party, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Arora:2023:LME,
  author =       "Simran Arora and Brandon Yang and Sabri Eyuboglu and Avanika Narayan and Andrew Hojel and Immanuel Trummer and Christopher R{\'e}",
  title =        "Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "92--105",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626294",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626294",
  abstract =     "A long standing goal in the data management community is developing systems that input documents and output queryable tables without user effort. Given the sheer variety of potential documents, state-of-the art systems make simplifying assumptions and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:QRD,
  author =       "Jinyang Li and Yuval Moskovitch and Julia Stoyanovich and H. V. Jagadish",
  title =        "Query Refinement for Diversity Constraint Satisfaction",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "106--118",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626295",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626295",
  abstract =     "Diversity, group representation, and similar needs often apply to query results, which in turn require constraints on the sizes of various subgroups in the result set.
Traditional relational queries only specify conditions as part of the query predicate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:EEL,
  author =       "Zhaoheng Li and Pranav Gor and Rahul Prabhu and Hui Yu and Yuzhou Mao and Yongjoo Park",
  title =        "{ElasticNotebook}: Enabling Live Migration for Computational Notebooks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "119--133",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626296",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626296",
  abstract =     "Computational notebooks (e.g., Jupyter, Google Colab) are widely used for interactive data science and machine learning. In those frameworks, users can start a session, then execute cells (i.e., a set of statements) to create variables, train models, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Huang:2023:BNL,
  author =       "Kecheng Huang and Zhaoyan Shen and Zili Shao and Tong Zhang and Feng Chen",
  title =        "Breathing New Life into an Old Tree: Resolving Logging Dilemma of {B$^+$}-tree on Modern Computational Storage Drives",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "134--147",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626297",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626297",
  abstract =     "Having dominated databases and various data management systems for decades, B$^+$-tree is infamously subject to a logging dilemma: One could improve B$^+$-tree speed performance by equipping it with a larger log, which nevertheless will degrade its crash \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zeng:2023:EEC,
  author =       "Xinyu Zeng and Yulong Hui and Jiahong Shen and Andrew Pavlo and Wes McKinney and Huanchen Zhang",
  title =        "An Empirical Evaluation of Columnar Storage Formats",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "148--161",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626298",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626298",
  abstract =     "Columnar storage is a core component of a modern data analytics system.
Although many database management systems (DBMSs) have proprietary storage formats, most provide extensive support to open-source storage formats such as Parquet and ORC to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yuan:2023:EGA,
  author =       "Yichao Yuan and Haojie Ye and Sanketh Vedula and Wynn Kaza and Nishil Talati",
  title =        "{Everest}: {GPU}-Accelerated System for Mining Temporal Motifs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "162--174",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626299",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626299",
  abstract =     "Temporal motif mining is the task of finding the occurrences of subgraph patterns within a large input temporal graph that obey the specified structural and temporal constraints. Despite its utility in several critical application domains that demand \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wu:2023:BSB,
  author =       "Xueyi Wu and Yuanyuan Xu and Wenjie Zhang and Ying Zhang",
  title =        "Billion-Scale Bipartite Graph Embedding: a Global-Local Induced Approach",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "175--183",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626300",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626300",
  abstract =     "Bipartite graph embedding (BGE), as the fundamental task in bipartite network analysis, is to map each node to compact low-dimensional vectors that preserve intrinsic properties. The existing solutions towards BGE fall into two groups: metric-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ni:2023:UAP,
  author =       "Wangze Ni and Pengze Chen and Lei Chen and Peng Cheng and Chen Jason Zhang and Xuemin Lin",
  title =        "Utility-Aware Payment Channel Network Rebalance",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "184--196",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626301",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626301",
  abstract =     "The payment channel network (PCN) is a promising solution to increase the throughput of blockchains. However, unidirectional transactions can deplete a user's deposits in a payment channel (PC), reducing the success ratio of transactions (SRoT). To \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:AAB,
  author =       "Pengfei Li and Wenqing Wei and Rong Zhu and Bolin Ding and Jingren Zhou and Hua Lu",
  title =        "{ALECE}: an Attention-based Learned Cardinality Estimator for {SPJ} Queries on Dynamic Workloads",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "197--210",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626302",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626302",
  abstract =     "For efficient query processing, DBMS query optimizers have for decades relied on delicate cardinality estimation methods. In this work, we propose an Attention-based LEarned Cardinality Estimator (ALECE for short) for SPJ queries. The core idea is to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Xia:2023:FLE,
  author =       "Haojun Xia and Zhen Zheng and Yuchao Li and Donglin Zhuang and Zhongzhu Zhou and Xiafei Qiu and Yong Li and Wei Lin and Shuaiwen Leon Song",
  title =        "{Flash-LLM}: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "211--224",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626303",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626303",
  abstract =     "With the fast growth of parameter size, it becomes increasingly challenging to deploy large generative models as they typically require large GPU memory consumption and massive computation. Unstructured model pruning has been a common approach to reduce \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Howard:2023:CCF,
  author =       "Heidi Howard and Fritz Alder and Edward Ashton and Amaury Chamayou and Sylvan Clebsch and Manuel Costa and Antoine Delignat-Lavaud and C{\'e}dric Fournet and Andrew Jeffery and Matthew Kerner and Fotios Kounelis and Markus A. Kuppe and Julien Maffre and Mark Russinovich and Christoph M.
Wintersteiger",
  title =        "Confidential Consortium Framework: Secure Multiparty Applications with Confidentiality, Integrity, and High Availability",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "225--240",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626304",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626304",
  abstract =     "Confidentiality, integrity protection, and high availability, abbreviated to CIA, are essential properties for trustworthy data systems. The rise of cloud computing and the growing demand for multiparty applications however means that building modern \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Duan:2023:VVL,
  author =       "Sijing Duan and Feng Lyu and Xin Zhu and Yi Ding and Haotian Wang and Desheng Zhang and Xue Liu and Yaoxue Zhang and Ju Ren",
  title =        "{VeLP}: Vehicle Loading Plan Learning from Human Behavior in Nationwide Logistics System",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "241--249",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626305",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626305",
  abstract =     "For a nationwide logistics transportation system, it is critical to make the vehicle loading plans (i.e., given many packages, deciding vehicle types and numbers) at each sorting and distribution center. This task is currently completed by dispatchers \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The math in the next title is brace-protected so that a bibliography
%%% style's case conversion cannot alter the control sequence inside it.

@Article{Naik:2023:RQS,
  author =       "Aaditya Naik and Aalok Thakkar and Adam Stein and Rajeev Alur and Mayur Naik",
  title =        "Relational Query Synthesis {$ \bowtie $} Decision Tree Learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "2",
  pages =        "250--263",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3626292.3626306",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Tue Dec 12 09:42:35 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3626292.3626306",
  abstract =     "We study the problem of synthesizing a core fragment of relational queries called select-project-join (SPJ) queries from input-output examples. Search-based synthesis techniques are suited to synthesizing projections and joins by navigating the network \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yao:2023:RRA,
  author =       "Feng Yao and Qian Tao and Wenyuan Yu and Yanfeng Zhang and Shufeng Gong and Qiange Wang and Ge Yu and Jingren Zhou",
  title =        "{RAGraph}: a Region-Aware Framework for Geo-Distributed Graph Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "264--277",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632094",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632094",
  abstract =     "In many global businesses of multinational enterprises, graph-structure data is usually geographically distributed in different regions to support low-latency services.
Geo-distributed graph processing suffers from the Wide Area Networks (WANs) with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% In the next title only the acronym DBMS is brace-protected, so that
%%% sentence-casing styles may still downcase the "-Based" suffix.

@Article{Lin:2023:SDB,
  author =       "Qiuru Lin and Sai Wu and Junbo Zhao and Jian Dai and Meng Shi and Gang Chen and Feifei Li",
  title =        "{SmartLite}: a {DBMS}-Based Serving System for {DNN} Inference in Resource-Constrained Environments",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "278--291",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632095",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632095",
  abstract =     "Many IoT applications require the use of multiple deep neural networks (DNNs) to perform various tasks on low-cost edge devices with limited computation resources. However, existing DNN model serving platforms, such as TensorFlow Serving and TorchServe, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wu:2023:BMC,
  author =       "Shiwen Wu and Qiyu Wu and Honghua Dong and Wen Hua and Xiaofang Zhou",
  title =        "Blocker and Matcher Can Mutually Benefit: a Co-Learning Framework for Low-Resource Entity Resolution",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "292--304",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632096",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632096",
  abstract =     "Entity resolution (ER) approaches typically consist of a blocker and a matcher.
They share the same goal and cooperate in different roles: the blocker first quickly removes obvious non-matches, and the matcher subsequently determines whether the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ang:2023:TTS,
  author =       "Yihao Ang and Qiang Huang and Yifan Bao and Anthony K. H. Tung and Zhiyong Huang",
  title =        "{TSGBench}: Time Series Generation Benchmark",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "305--318",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632097",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632097",
  abstract =     "Synthetic Time Series Generation (TSG) is crucial in a range of applications, including data augmentation, anomaly detection, and privacy preservation. Although significant strides have been made in this field, existing methods exhibit three key \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Punter:2023:OEM,
  author =       "Wieger R.
Punter and Odysseas Papapetrou and Minos Garofalakis",
  title =        "{OmniSketch}: Efficient Multi-Dimensional High-Velocity Stream Analytics with Arbitrary Predicates",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "319--331",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632098",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632098",
  abstract =     "A key need in different disciplines is to perform analytics over fast-paced data streams, similar in nature to the traditional OLAP analytics in relational databases --- i.e., with filters and aggregates. Storing unbounded streams, however, is not a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% The math in the next title is brace-protected so that a bibliography
%%% style's case conversion cannot alter the symbols inside it.

@Article{Chung:2023:MBK,
  author =       "Kai Hiu Chung and Alexander Zhou and Yue Wang and Lei Chen",
  title =        "Maximum Balanced {$ (k, \epsilon)$}-Bitruss Detection in Signed Bipartite Graph",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "332--344",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632099",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632099",
  abstract =     "Signed bipartite graphs represent relationships between two sets of entities, including both positive and negative interactions, allowing for a more comprehensive modeling of real-world networks. In this work, we focus on the detection of cohesive \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Li:2023:MVI,
  author =       "Xiao Li and Huan Li and Hua Lu and Christian S. Jensen and Varun Pandey and Volker Markl",
  title =        "Missing Value Imputation for Multi-Attribute Sensor Data Streams via Message Propagation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "345--358",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632100",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632100",
  abstract =     "Sensor data streams occur widely in various real-time applications in the context of the Internet of Things (IoT). However, sensor data streams feature missing values due to factors such as sensor failures, communication errors, or depleted batteries. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:IID,
  author =       "Yuhang Chen and Chaoyun Zhang and Minghua Ma and Yudong Liu and Ruomeng Ding and Bowen Li and Shilin He and Saravan Rajmohan and Qingwei Lin and Dongmei Zhang",
  title =        "{ImDiffusion}: Imputed Diffusion Models for Multivariate Time Series Anomaly Detection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "359--372",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632101",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632101",
  abstract =     "Anomaly detection in multivariate time series data is of paramount importance for large-scale systems.
However, accurately detecting anomalies in such data poses significant challenges due to the need for precise data modeling capability. Existing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sun:2023:CIP,
  author =       "Dajun Sun and Wei Dong and Ke Yi",
  title =        "Confidence Intervals for Private Query Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "373--385",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632102",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632102",
  abstract =     "Whenever randomness is involved in query processing, confidence intervals are commonly returned to the user to indicate the statistical significance of the query answer. However, this problem has not been explicitly addressed under differential privacy, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Liang:2023:SBF,
  author =       "Zhiyu Liang and Jianfeng Zhang and Chen Liang and Hongzhi Wang and Zheng Liang and Lujia Pan",
  title =        "A Shapelet-Based Framework for Unsupervised Multivariate Time Series Representation Learning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "386--399",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632103",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632103",
  abstract =     "Recent studies have shown great promise in unsupervised representation learning (URL) for multivariate time series, because URL has the capability in learning generalizable representation for many downstream tasks without using inaccessible labels. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2023:FSE,
  author =       "Letong Wang and Xiangyun Ding and Yan Gu and Yihan Sun",
  title =        "Fast and Space-Efficient Parallel Algorithms for Influence Maximization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "400--413",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632104",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632104",
  abstract =     "Influence Maximization (IM) is a crucial problem in data science. The goal is to find a fixed-size set of highly influential seed vertices on a network to maximize the influence spread along the edges.
While IM is NP-hard on commonly used diffusion \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:TEF,
  author =       "Yile Chen and Gao Cong and Cuauhtemoc Anda",
  title =        "{TERI}: an Effective Framework for Trajectory Recovery with Irregular Time Intervals",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "414--426",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632105",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632105",
  abstract =     "The proliferation of trajectory data has facilitated various applications in urban spaces, such as travel time estimation, traffic monitoring, and flow prediction. These applications require a substantial volume of high-quality trajectories as the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:DGS,
  author =       "Yuhan Chen and Haojie Ye and Sanketh Vedula and Alex Bronstein and Ronald Dreslinski and Trevor Mudge and Nishil Talati",
  title =        "Demystifying Graph Sparsification Algorithms in Graph Properties Preservation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "427--440",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632106",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632106",
  abstract =     "Graph sparsification is a technique that approximates a given graph by a sparse graph with a subset of vertices and/or edges.
The goal of an effective sparsification algorithm is to maintain specific graph properties relevant to the downstream task while \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Cao:2023:GDS,
  author =       "Jiashen Cao and Rathijit Sen and Matteo Interlandi and Joy Arulraj and Hyesoon Kim",
  title =        "{GPU} Database Systems Characterization and Optimization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "441--454",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632107",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632107",
  abstract =     "GPUs offer massive parallelism and high-bandwidth memory access, making them an attractive option for accelerating data analytics in database systems. However, while modern GPUs possess more resources than ever before (e.g., higher DRAM bandwidth), \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2023:NDG,
  author =       "Chaoyi Chen and Dechao Gao and Yanfeng Zhang and Qiange Wang and Zhenbo Fu and Xuecang Zhang and Junhua Zhu and Yu Gu and Ge Yu",
  title =        "{NeutronStream}: a Dynamic {GNN} Training Framework with Sliding Window for Graph Streams",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "455--468",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632108",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632108",
  abstract =     "Existing Graph Neural Network (GNN) training frameworks have been designed to help developers easily create performant GNN implementations. However, most existing GNN frameworks assume that the input graphs are static, but ignore that most real-world \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Hildred:2023:CLL,
  author =       "Joshua Hildred and Michael Abebe and Khuzaima Daudjee",
  title =        "{Caerus}: Low-Latency Distributed Transactions for Geo-Replicated Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "469--482",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632109",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632109",
  abstract =     "Distributed deterministic database systems achieve high transaction throughput for geographically replicated data.
Supporting transactions with ACID guarantees requires deterministic databases to order transactions globally to dictate execution order. In \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:EEA,
  author =       "Aoqian Zhang and Shuqing Deng and Dongping Cui and Ye Yuan and Guoren Wang",
  title =        "An Experimental Evaluation of Anomaly Detection in Time Series",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "483--496",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632110",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632110",
  abstract =     "Anomaly detection in time series data has been studied for decades in both statistics and computer science. Various algorithms have been proposed for different scenarios, such as fraud detection, environmental monitoring, manufacturing, and healthcare. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Singh:2023:FAE,
  author =       "Mukul Singh and Jos{\'e} Cambronero and Sumit Gulwani and Vu Le and Carina Negreanu and Elnaz Nouri and Mohammad Raza and Gust Verbruggen",
  title =        "{FormaT5}: Abstention and Examples for Conditional Table Formatting with Natural Language",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "497--510",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632111",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632111",
  abstract =     "Formatting is an important property in tables for visualization, presentation, and analysis. Spreadsheet software allows users to automatically format their tables by writing data-dependent conditional formatting (CF) rules. Writing such rules is often \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Schonberger:2023:QID,
  author =       "Manuel Sch{\"o}nberger and Immanuel Trummer and Wolfgang Mauerer",
  title =        "Quantum-Inspired Digital Annealing for Join Ordering",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "511--524",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632112",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632112",
  abstract =     "Finding the optimal join order (JO) is one of the most important problems in query optimisation, and has been extensively considered in research and practise.
As it involves huge search spaces, approximation approaches and heuristics are commonly used, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Musleh:2023:KSB,
  author =       "Mashaal Musleh and Mohamed F. Mokbel",
  title =        "{Kamel}: a Scalable {BERT}-Based System for Trajectory Imputation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "525--538",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632113",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632113",
  abstract =     "Numerous important applications rely on detailed trajectory data. Yet, unfortunately, trajectory datasets are typically sparse with large spatial and temporal gaps between each two points, which is a major hurdle for their accuracy. This paper presents \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2023:ETL,
  author =       "Xinyi Zhang and Hong Wu and Yang Li and Zhengju Tang and Jian Tan and Feifei Li and Bin Cui",
  title =        "An Efficient Transfer Learning Based Configuration Adviser for Database Tuning",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "3",
  pages =        "539--552",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3632093.3632114",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Wed Mar 20 07:36:59 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3632093.3632114",
  abstract =     "In recent years, a wide spectrum of database tuning systems have emerged to automatically optimize database performance.
However, these systems require a significant number of workload runs to deliver a satisfactory level of database performance, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Petralia:2023:ATT, author = "Adrien Petralia and Philippe Charpentier and Themis Palpanas", title = "{ADF \& TransApp}: a Transformer-Based Framework for Appliance Detection Using Smart Meter Consumption Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "3", pages = "553--562", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3632093.3632115", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:36:59 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3632093.3632115", abstract = "Over the past decade, millions of smart meters have been installed by electricity suppliers worldwide, allowing them to collect a large amount of electricity consumption data, albeit sampled at a low frequency (one point every 30min). One of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wooders:2023:RAA, author = "Sarah Wooders and Xiangxi Mo and Amit Narang and Kevin Lin and Ion Stoica and Joseph M. Hellerstein and Natacha Crooks and Joseph E. 
Gonzalez", title = "{RALF}: Accuracy-Aware Scheduling for Feature Store Maintenance", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "3", pages = "563--576", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3632093.3632116", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:36:59 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3632093.3632116", abstract = "Feature stores (also sometimes referred to as embedding stores) are becoming ubiquitous in model serving systems: downstream applications query these stores for auxiliary inputs at inference-time. Stored features are derived by featurizing rapidly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2023:ALH, author = "Kaisong Huang and Tianzheng Wang and Qingqing Zhou and Qingzhong Meng", title = "The Art of Latency Hiding in Modern Database Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "3", pages = "577--590", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3632093.3632117", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:36:59 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3632093.3632117", abstract = "Modern database engines must well use multicore CPUs, large main memory and fast storage devices to achieve high performance. A common theme is hiding latencies such that more CPU cycles can be dedicated to ``real'' work, improving overall throughput. Yet \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Najafi:2023:MSN, author = "Mohammad Matin Najafi and Chenhao Ma and Xiaodong Li and Reynold Cheng and Laks V. S. Lakshmanan", title = "{MOSER}: Scalable Network {Motif} Discovery Using Serial Test", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "3", pages = "591--603", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3632093.3632118", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:36:59 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3632093.3632118", abstract = "Given a graph G, a motif (e.g., 3-node clique) is a fundamental building block for G. Recently, motif-based graph analysis has attracted much attention due to its efficacy in tasks such as clustering, ranking, and link prediction. These tasks require \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:CMP, author = "Dongxiang Zhang and Teng Ma and Junnan Hu and Yijun Bei and Kian-Lee Tan and Gang Chen", title = "Co-Movement Pattern Mining from Videos", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "3", pages = "604--616", month = nov, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3632093.3632119", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:36:59 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3632093.3632119", abstract = "Co-movement pattern mining from GPS trajectories has been an intriguing subject in spatial-temporal data mining. 
In this paper, we extend this research line by migrating the data source from GPS sensors to surveillance cameras, and presenting the first \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2023:EAS, author = "Qian Ge and Yu Liu and Yinghao Zhao and Yuetian Sun and Lei Zou and Yuxing Chen and Anqun Pan", title = "Efficient and Accurate {SimRank}-Based Similarity Joins: Experiments, Analysis, and Improvement", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "617--629", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636219", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636219", abstract = "SimRank-based similarity joins, which mainly include threshold-based and top-$k$ similarity joins, are important types of all-pair SimRank queries. Although a line of related algorithms have been proposed recently, they still fall short of providing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2023:ERN, author = "Wentao Li and Maolin Cai and Min Gao and Dong Wen and Lu Qin and Wei Wang", title = "Expanding Reverse Nearest Neighbors", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "630--642", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636220", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636220", abstract = "In a graph, the reverse nearest neighbors (RNN) of vertex f refer to the set of vertices that consider f as their nearest neighbor. When f represents a facility like a subway station, its RNN comprises potential users who prefer the nearest facility. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:ESO, author = "Fuheng Zhao and Divyakant Agrawal and Amr {El Abbadi} and Ahmed Metwally and Claire Mathieu and Michel de Rougemont", title = "Errata for {``SpaceSaving$ \pm $: an Optimal Algorithm for Frequency Estimation and Frequent Items in the Bounded-Deletion Model''}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "643--643", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636221", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", note = "See \cite{Zhao:2022:SPO}.", URL = "https://dl.acm.org/doi/10.14778/3636218.3636221", abstract = "This errata article points out an implicit assumption in the work of four of us published in VLDB 2022. 
The SpaceSaving$ \pm $ algorithm in bounded deletion data stream presented in the paper implicitly assumed deletions happen after all insertions. When \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Siddiqui:2023:CET, author = "Tarique Siddiqui and Vivek Narasayya and Marius Dumitru and Surajit Chaudhuri", title = "Cache-Efficient Top-$k$ Aggregation over High Cardinality Large Datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "644--656", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636222", abstract = "Top-k aggregation queries are widely used in data analytics for summarizing and identifying important groups from large amounts of data. These queries are usually processed by first computing exact aggregates for all groups and then selecting the groups \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2023:ETB, author = "Xinwei Cai and Xiangyu Ke and Kai Wang and Lu Chen and Tianming Zhang and Qing Liu and Yunjun Gao", title = "Efficient Temporal Butterfly Counting and Enumeration on Temporal Bipartite Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "657--670", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636223", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636223", abstract = "Bipartite graphs characterize relationships between two different sets of entities, like actor-movie, user-item, and author-paper. The butterfly, a 4-vertices 4-edges (2,2)-biclique, is the simplest cohesive motif in a bipartite graph and is the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhong:2023:TTB, author = "Tianxiong Zhong and Zhiwei Zhang and Guo Lu and Ye Yuan and Yu-Ping Wang and Guoren Wang", title = "{TVM}: a Tile-based Video Management Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "671--684", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636224", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636224", abstract = "With the exponential growth of video data, there is a pressing need for efficient video analysis technology. 
Modern query frameworks aim to accelerate queries by reducing the frequency of calls to expensive deep neural networks, which often overlook the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:SCR, author = "Yi Zhang and Jan Deriu and George Katsogiannis-Meimarakis and Catherine Kosten and Georgia Koutrika and Kurt Stockinger", title = "{ScienceBenchmark}: a Complex Real-World Benchmark for Evaluating Natural Language to {SQL} Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "685--698", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636225", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636225", abstract = "Natural Language to SQL systems (NL-to-SQL) have recently shown improved accuracy (exceeding 80\%) for natural language to SQL query translation due to the emergence of transformer-based language models, and the popularity of the Spider benchmark. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2023:DMS, author = "Lu Chen and Chengfei Liu and Rui Zhou and Kewen Liao and Jiajie Xu and Jianxin Li", title = "Densest Multipartite Subgraph Search in Heterogeneous Information Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "699--711", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636226", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636226", abstract = "Cohesive multipartite subgraphs (CMS) in heterogeneous information networks (HINs) uncover closely connected vertex groups of multiple types, enhancing real applications like community search and anomaly detection. However, existing works for HINs pay \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nagrecha:2023:SOD, author = "Kabir Nagrecha and Arun Kumar", title = "{Saturn}: an Optimized Data System for Multi-Large-Model Deep Learning Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "712--725", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636227", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636227", abstract = "Large models such as GPT-3 and ChatGPT have transformed deep learning (DL), powering applications that have captured the public's imagination. Such models must be trained on multiple GPUs due to their size and computational load, driving the development \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2023:BTF, author = "Miao Cai and Junru Shen and Yifan Yuan and Zhihao Qu and Baoliu Ye", title = "{BonsaiKV}: Towards Fast, Scalable, and Persistent Key--Value Stores with Tiered, Heterogeneous Memory System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "726--739", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636228", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636228", abstract = "Emerging NUMA/CXL-based tiered memory systems with heterogeneous memory devices such as DRAM and NVMM deliver ultrafast speed, large capacity, and data persistence all at once, offering great promise to high-performance in-memory key-value stores. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Reiner:2023:SEC, author = "Silvan Reiner and Michael Grossniklaus", title = "Sample-Efficient Cardinality Estimation Using Geometric Deep Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "740--752", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636229", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636229", abstract = "In database systems, accurate cardinality estimation is a cornerstone of effective query optimization. In this context, estimators that use machine learning have shown significant promise. 
Despite their potential, the effectiveness of these learned \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:MTS, author = "Kai Zhao and Chenjuan Guo and Yunyao Cheng and Peng Han and Miao Zhang and Bin Yang", title = "Multiple Time Series Forecasting with Dynamic Graph Modeling", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "753--765", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636230", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636230", abstract = "Multiple time series forecasting plays an essential role in many applications. Solutions based on graph neural network (GNN) that deliver state-of-the-art forecasting performance use the relation graph which can capture historical correlations among time \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2023:WGA, author = "Yunyao Cheng and Peng Chen and Chenjuan Guo and Kai Zhao and Qingsong Wen and Bin Yang and Christian S. Jensen", title = "Weakly Guided Adaptation for Robust Time Series Forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "766--779", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636231", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636231", abstract = "Robust multivariate time series forecasting is crucial in many cyberphysical and Internet of Things applications. 
Existing state-of-the-art robust forecasting models decompose time series into independent functions covering trends and periodicities. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2023:ACA, author = "Rui Yang and Evgenios M. Kornaropoulos and Yue Cheng", title = "Algorithmic Complexity Attacks on Dynamic Learned Indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "780--793", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636232", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636232", abstract = "Learned Index Structures (LIS) view a sorted index as a model that learns the data distribution, takes a data element key as input, and outputs the predicted position of the key. The original LIS can only handle lookup operations with no support for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2023:MDC, author = "Jiaqi Zhu and Shaofeng Cai and Fang Deng and Beng Chin Ooi and Wenqiao Zhang", title = "{METER}: a Dynamic Concept Adaptation Framework for Online Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "794--807", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636233", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636233", abstract = "Real-time analytics and decision-making require online anomaly detection (OAD) to handle drifts in data streams efficiently and effectively. Unfortunately, existing approaches are often constrained by their limited detection capacity and slow adaptation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2023:EAL, author = "Hailin Zhang and Penghao Zhao and Xupeng Miao and Yingxia Shao and Zirui Liu and Tong Yang and Bin Cui", title = "Experimental Analysis of Large-Scale Learnable Vector Storage Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "808--822", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636234", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636234", abstract = "Learnable embedding vector is one of the most important applications in machine learning, and is widely used in various database-related domains. 
However, the high dimensionality of sparse data in recommendation tasks and the huge volume of corpus in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2023:CSC, author = "Yue Zhao and Zhaodonghui Li and Gao Cong", title = "A Comparative Study and Component Analysis of Query Plan Representation Techniques in {ML4DB} Studies", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "823--835", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636235", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636235", abstract = "Query plan is widely used as input in machine learning for databases (ML4DB) research, with query plan representation as a critical step. However, existing studies typically focus on one task, and propose a novel design to represent query plans along \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhuang:2023:TGD, author = "Zeyang Zhuang and Penghui Li and Pingchuan Ma and Wei Meng and Shuai Wang", title = "Testing Graph Database Systems via Graph-Aware Metamorphic Relations", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "836--848", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636236", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636236", abstract = "Graph database systems (GDBs) have supported many important real-world applications such as social networks, logistics, and path planning. Meanwhile, logic bugs are also prevalent in GDBs, leading to incorrect results and severe consequences. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cong:2023:OCE, author = "Tianji Cong and Madelon Hulsebos and Zhenjie Sun and Paul Groth and H. V. Jagadish", title = "{Observatory}: Characterizing Embeddings of Relational Tables", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "849--862", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636237", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636237", abstract = "Language models and specialized table embedding models have recently demonstrated strong performance on many tasks over tabular data. 
Researchers and practitioners are keen to leverage these models in many new application contexts; but limited \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2023:FAD, author = "Taeyoon Kim and ChanHo Park and Mansur Mukimbekov and Heelim Hong and Minseok Kim and Ze Jin and Changdae Kim and Ji-Yong Shin and Myeongjae Jeon", title = "{FusionFlow}: Accelerating Data Preprocessing for Machine Learning with {CPU--GPU} Cooperation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "863--876", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636238", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636238", abstract = "Data augmentation enhances the accuracy of DL models by diversifying training samples through a sequence of data transformations. While recent advancements in data augmentation have demonstrated remarkable efficacy, they often rely on computationally \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohr-Daurat:2023:BAD, author = "Hubert Mohr-Daurat and Xuan Sun and Holger Pirk", title = "{BOSS} --- an Architecture for Database Kernel Composition", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "877--890", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636239", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636239", abstract = "Composable Database System Research has yielded components such as Apache Arrow for Storage, Meta's Velox for processing and Apache Calcite for query planning. What is lacking, however, is a design for a general, efficient and easy-to-use architecture to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhi:2023:CBC, author = "Xiangyu Zhi and Xiao Yan and Bo Tang and Ziyao Yin and Yanchao Zhu and Minqi Zhou", title = "{CoroGraph}: Bridging Cache Efficiency and Work Efficiency for Graph Algorithm Execution", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "891--903", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636240", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636240", abstract = "Many systems are designed to run graph algorithms efficiently in memory but they achieve only cache efficiency or work efficiency. 
We tackle this fundamental trade-off in existing systems by designing CoroGraph, a system that attains both cache \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2023:MSO, author = "Audrey Cheng and Jack Waudby and Hugo Firth and Natacha Crooks and Ion Stoica", title = "Mammoths are Slow: The Overlooked Transactions of Graph Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "904--911", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636241", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636241", abstract = "This paper argues for better concurrency control to support mammoth transactions, which read and write to many items. While these requests are prevalent on graph data, few systems support them efficiently. Currently, developers must make the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2023:VVS, author = "Enyuan Zhou and Song Guo and Zicong Hong and Christian S. 
Jensen and Yang Xiao and Dalin Zhang and Jinwen Liang and Qingqi Pei", title = "{VeriDKG}: a Verifiable {SPARQL} Query Engine for Decentralized Knowledge Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "4", pages = "912--925", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.14778/3636218.3636242", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Mar 20 07:37:01 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3636218.3636242", abstract = "The ability to decentralize knowledge graphs (KG) is important to exploit the full potential of the Semantic Web and realize the Web 3.0 vision. However, decentralization also renders KGs more prone to attacks with adverse effects on data integrity and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Weng:2024:EEP, author = "Lianggui Weng and Rong Zhu and Di Wu and Bolin Ding and Bolong Zheng and Jingren Zhou", title = "{Eraser}: Eliminating Performance Regression on Learned Query Optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "926--938", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641205", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641205", abstract = "Efficient query optimization is crucial for database management systems. Recently, machine learning models have been applied in query optimizers to generate better plans, but the unpredictable performance regressions prevent them from being truly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:HNB, author = "Chao Zhang and Guoliang Li and Tao Lv", title = "{HyBench}: a New Benchmark for {HTAP} Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "939--951", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641206", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641206", abstract = "In this paper, we propose, HyBench, a new benchmark for HTAP databases. First, we generate the testing data by simulating a representative HTAP application. We particularly develop a time-dependent generation phase and an anomaly generation phase for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tae:2024:FFA, author = "Ki Hyun Tae and Hantian Zhang and Jaeyoung Park and Kexin Rong and Steven Euijong Whang", title = "{Falcon}: Fair Active Learning Using Multi-Armed Bandits", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "952--965", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641207", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641207", abstract = "Biased data can lead to unfair machine learning models, highlighting the importance of embedding fairness at the beginning of data analysis, particularly during dataset curation and labeling. In response, we propose Falcon, a scalable fair active \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:BSC, author = "Honghu Wu and Xiangrong Zhu and Wei Hu", title = "A Blockchain System for Clustered Federated Learning with Peer-to-Peer Knowledge Transfer", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "966--979", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641208", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641208", abstract = "Federated Learning (FL) is a novel distributed, privacy-preserving machine learning paradigm. Conventional FL suffers from drawbacks such as single point of failure and client drift. Blockchain is a distributed computing architecture famous for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:PSD, author = "Rong Zhu and Lianggui Weng and Wenqing Wei and Di Wu and Jiazhen Peng and Yifan Wang and Bolin Ding and Defu Lian and Bolong Zheng and Jingren Zhou", title = "{PilotScope}: Steering Databases with Machine Learning Drivers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "980--993", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641209", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641209", abstract = "Learned databases, or AI4DB techniques, have rapidly developed in the last decade. 
Deploying machine learning (ML) and AI4DB algorithms into actual databases is the gold standard to examine their performance in practice. However, due to the complexity of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:TSO, author = "Yishuai Li and Yunfeng Zhu and Chao Shi and Guanhua Zhang and Jianzhong Wang and Xiaolu Zhang", title = "Timestamp as a Service, Not an Oracle", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "994--1006", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641210", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641210", abstract = "We present a logical timestamping mechanism for ordering transactions in distributed databases, eliminating the single point of failure (SPoF) that bother existing timestamp ``oracles''. The main innovation is a bipartite client-server architecture, where \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xing:2024:DDI, author = "Junjie Xing and Xinyu Wang and H. V. Jagadish", title = "Data-Driven Insight Synthesis for Multi-Dimensional Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1007--1019", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641211", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641211", abstract = "Exploratory data analysis can uncover interesting data insights from data. 
Current methods utilize ``interestingness measures'' designed based on system designers' perspectives, thus inherently restricting the insights to their defined scope. These systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xing:2024:DNM, author = "Naili Xing and Shaofeng Cai and Gang Chen and Zhaojing Luo and Beng Chin Ooi and Jian Pei", title = "Database Native Model Selection: Harnessing Deep Neural Networks in Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1020--1033", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641212", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641212", abstract = "The growing demand for advanced analytics beyond statistical aggregation calls for database systems that support effective model selection of deep neural networks (DNNs). However, existing model selection strategies are based on either training-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:QSD, author = "Kaiyu Chen and Dong Wen and Wenjie Zhang and Ying Zhang and Xiaoyang Wang and Xuemin Lin", title = "Querying Structural Diversity in Streaming Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1034--1046", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641213", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641213", abstract = "Structural diversity of a vertex refers to the diversity of connections within its neighborhood and has been applied in various fields such as viral marketing and user engagement. The paper studies querying the structural diversity of a vertex for any \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gou:2024:LSE, author = "Xiangyang Gou and Xinyi Ye and Lei Zou and Jeffrey Xu Yu", title = "{LM-SRPQ}: Efficiently Answering Regular Path Query in Streaming Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1047--1059", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641214", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641214", abstract = "Regular path query (RPQ) is a basic operation for graph data analysis, and persistent RPQ in streaming graphs is a new-emerging research topic. In this paper, we propose a novel algorithm for persistent RPQ in streaming graphs, named LM-SRPQ. It solves \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2024:EET, author = "Shihong Gao and Yiming Li and Yanyan Shen and Yingxia Shao and Lei Chen", title = "{ETC}: Efficient Training of Temporal Graph Neural Networks over Large-Scale Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1060--1072", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641215", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641215", abstract = "Dynamic graphs play a crucial role in various real-world applications, such as link prediction and node classification on social media and e-commerce platforms. Temporal Graph Neural Networks (T-GNNs) have emerged as a leading approach for handling \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:TFS, author = "Chenyuan Wu and Mohammad Javad Amiri and Haoyun Qin and Bhavana Mehta and Ryan Marcus and Boon Thau Loo", title = "Towards Full Stack Adaptivity in Permissioned Blockchains", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1073--1080", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641216", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641216", abstract = "This paper articulates our vision for a learning-based untrustworthy distributed database. We focus on permissioned blockchain systems as an emerging instance of untrustworthy distributed databases and argue that as novel smart contracts, modern hardware,
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2024:BLC, author = "Jindong Han and Weijia Zhang and Hao Liu and Tao Tao and Naiqiang Tan and Hui Xiong", title = "{BigST}: Linear Complexity Spatio-Temporal Graph Neural Network for Traffic Forecasting on Large-Scale Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1081--1090", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641217", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641217", abstract = "Spatio-Temporal Graph Neural Network (STGNN) has been used as a common workhorse for traffic forecasting. However, most of them require prohibitive quadratic computational complexity to capture long-range spatio-temporal dependencies, thus hindering \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Min:2024:SWO, author = "Xinhao Min and Kai Lu and Pengyu Liu and Jiguang Wan and Changsheng Xie and Daohui Wang and Ting Yao and Huatao Wu", title = "{SepHash}: a Write-Optimized Hash Index On Disaggregated Memory via Separate Segment Structure", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1091--1104", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641218", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641218", abstract = "Disaggregated memory separates compute and memory resources into independent pools connected by fast RDMA (Remote Direct Memory Access) networks, which can improve memory utilization, reduce cost, and enable elastic scaling of compute and memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2024:XBM, author = "Dahai Tang and Jiali Wang and Rong Chen and Lei Wang and Wenyuan Yu and Jingren Zhou and Kenli Li", title = "{XGNN}: Boosting Multi-{GPU} {GNN} Training via Global {GNN} Memory Store", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1105--1118", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641219", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641219", abstract = "GPUs are commonly utilized to accelerate GNN training, particularly on a multi-GPU server with high-speed interconnects (e.g., NVLink and NVSwitch). 
However, the rapidly increasing scale of graphs poses a challenge to applying GNN to real-world \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tao:2024:CEP, author = "Youming Tao and Cheng-Long Wang and Miao Pan and Dongxiao Yu and Xiuzhen Cheng and Di Wang", title = "Communication Efficient and Provable Federated Unlearning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1119--1131", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641220", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641220", abstract = "We study federated unlearning, a novel problem to eliminate the impact of specific clients or data points on the global model learned via federated learning (FL). This problem is driven by the right to be forgotten and the privacy challenges in FL. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2024:TSE, author = "Dawei Gao and Haibin Wang and Yaliang Li and Xiuyu Sun and Yichen Qian and Bolin Ding and Jingren Zhou", title = "Text-to-{SQL} Empowered by Large Language Models: a Benchmark Evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1132--1145", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641221", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641221", abstract = "Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL task. 
However, the absence of a systematical benchmark inhibits the development of designing effective, efficient and economic LLM-based Text-to-SQL solutions. To address this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mai:2024:SPQ, author = "Anh L. Mai and Pengyu Wang and Azza Abouzied and Matteo Brucato and Peter J. Haas and Alexandra Meliou", title = "Scaling Package Queries to a Billion Tuples via Hierarchical Partitioning and Customized Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "5", pages = "1146--1158", month = jan, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3641204.3641222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:34 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3641204.3641222", abstract = "A package query returns a package---a multiset of tuples---that maximizes or minimizes a linear objective function subject to linear constraints, thereby enabling in-database decision support. Prior work has established the equivalence of package queries \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:MIM, author = "Yuhao Deng and Chengliang Chai and Lei Cao and Nan Tang and Jiayi Wang and Ju Fan and Ye Yuan and Guoren Wang", title = "{MisDetect}: Iterative Mislabel Detection using Early Loss", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1159--1172", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648161", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648161", abstract = "Supervised machine learning (ML) models trained on data with mislabeled instances often produce inaccurate results due to label errors. Traditional methods of detecting mislabeled instances rely on data proximity, where an instance is considered \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2024:CMA, author = "Wenfei Fan and Muyang Liu and Shuhao Liu and Chao Tian", title = "Capturing More Associations by Referencing External Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1173--1186", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648162", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648162", abstract = "This paper studies association rule discovery in a graph $ G_1 $ by referencing an external graph $ G_2 $ with overlapping information. The objective is to enrich $ G_1 $ with relevant properties and links from $ G_2 $. 
As a testbed, we consider Graph Association Rules \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2024:QEQ, author = "Longlong Lin and Pingpeng Yuan and Rong-Hua Li and Chunxue Zhu and Hongchao Qin and Hai Jin and Tao Jia", title = "{QTCS}: Efficient Query-Centered Temporal Community Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1187--1199", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648163", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648163", abstract = "Temporal community search is an important task in graph analysis, which has been widely used in many practical applications. However, existing methods suffer from two major defects: (i) they only require that the target result contains the query vertex q, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fu:2024:DAD, author = "Jie Fu and Qingqing Ye and Haibo Hu and Zhili Chen and Lulu Wang and Kuncan Wang and Xun Ran", title = "{DPSUR}: Accelerating Differentially Private Stochastic Gradient Descent Using Selective Update and Release", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1200--1213", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648164", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648164", abstract = "Machine learning models are known to memorize private data to reduce their training loss, which can be inadvertently exploited by privacy attacks such as model inversion and membership inference. To protect against these attacks, differential privacy (DP) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Erben:2024:HCW, author = "Alexander Erben and Ruben Mayer and Hans-Arno Jacobsen", title = "How Can We Train Deep Learning Models Across Clouds and Continents? {An} Experimental Study", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1214--1226", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648165", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648165", abstract = "This paper aims to answer the question: Can deep learning models be cost-efficiently trained on a global market of spot VMs spanning different data centers and cloud providers?
To provide guidance, we extensively evaluate the cost and throughput \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Park:2024:ASA, author = "Jeongmin Brian Park and Vikram Sharma Mailthody and Zaid Qureshi and Wen-mei Hwu", title = "Accelerating Sampling and Aggregation Operations in {GNN} Frameworks with {GPU} Initiated Direct Storage Accesses", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1227--1240", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648166", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648166", abstract = "Graph Neural Networks (GNNs) are emerging as a powerful tool for learning from graph-structured data and performing sophisticated inference tasks in various application domains. Although GNNs have been shown to be effective on modest-sized graphs, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2024:CEG, author = "Hao Yuan and Yajiong Liu and Yanfeng Zhang and Xin Ai and Qiange Wang and Chaoyi Chen and Yu Gu and Ge Yu", title = "Comprehensive Evaluation of {GNN} Training Systems: a Data Management Perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1241--1254", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648167", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648167", abstract = "Many Graph Neural Network (GNN) training systems have emerged recently to support efficient GNN training. Since GNNs embody complex data dependencies between training samples, the training of GNNs should address distinct challenges different from DNN \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2024:LFH, author = "Tsz Nam Chan and Rui Zang and Bojian Zhu and Leong Hou U. and Dingming Wu and Jianliang Xu", title = "{LION}: Fast and High-Resolution Network Kernel Density Visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1255--1268", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648168", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648168", abstract = "Network Kernel Density Visualization (NKDV) has often been used in a wide range of applications, e.g., criminology, transportation science, and urban planning. 
However, NKDV is computationally expensive, which cannot be scalable to large-scale datasets \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:PBP, author = "Zitao Li and Bolin Ding and Liuyi Yao and Yaliang Li and Xiaokui Xiao and Jingren Zhou", title = "Performance-Based Pricing for Federated Learning via Auction", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1269--1282", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648169", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648169", abstract = "Many machine learning techniques rely on plenty of training data. However, data are often possessed unequally by different entities, with a large proportion of data being held by a small number of data-rich entities. It can be challenging to incentivize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Diao:2024:OIO, author = "Yiqun Diao and Yutong Yang and Qinbin Li and Bingsheng He and Mian Lu", title = "{OEBench}: Investigating Open Environment Challenges in Real-World Relational Data Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1283--1296", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648170", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648170", abstract = "How to get insights from relational data streams in a timely manner is a hot research topic. 
Data streams can present unique challenges, such as distribution drifts, outliers, emerging classes, and changing features, which have recently been described as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2024:IMV, author = "Jiadong Xie and Zehua Chen and Deming Chu and Fan Zhang and Xuemin Lin and Zhihong Tian", title = "Influence Maximization via Vertex Countering", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1297--1309", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648171", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648171", abstract = "Competitive viral marketing considers the product competition of multiple companies, where each user may adopt one product and propagate the product to other users. Existing studies focus on a traditional seeding strategy where a company only selects \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:ODA, author = "Tingting Wang and Shixun Huang and Zhifeng Bao and J. 
Shane Culpepper and Volkan Dedeoglu and Reza Arablouei", title = "Optimizing Data Acquisition to Enhance Machine Learning Performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1310--1323", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648172", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648172", abstract = "In this paper, we study how to acquire labeled data points from a large data pool to enrich a training set for enhancing supervised machine learning (ML) performance. The state-of-the-art solution is the clustering-based training set selection (CTS) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:MSC, author = "Xin Chen and Jieming Shi and You Peng and Wenqing Lin and Sibo Wang and Wenjie Zhang", title = "Minimum Strongly Connected Subgraph Collection in Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1324--1336", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648173", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648173", abstract = "Real-world directed graphs are dynamically changing, and it is important to identify and maintain the strong connectivity information between nodes, which is useful in numerous applications. Given an input graph G, we study a new problem, minimum \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:FDF, author = "Junhao Zhu and Yuren Mao and Lu Chen and Congcong Ge and Ziheng Wei and Yunjun Gao", title = "{FusionQuery}: On-demand Fusion Queries over Multi-source Heterogeneous Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1337--1349", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648174", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648174", abstract = "Centralised data management systems (e.g., data lakes) support queries over multi-source heterogeneous data. However, the query results from multiple sources commonly involve between-source conflicts, which makes query results unreliable and confusing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Justen:2024:PAN, author = "David Justen and Daniel Ritter and Campbell Fraser and Andrew Lamb and Allison Lee and Thomas Bodner and Mhd Yamen Haddad and Steffen Zeuch and Volker Markl and Matthias Boehm", title = "{POLAR}: Adaptive and Non-invasive Join Order Selection via Plans of Least Resistance", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1350--1363", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648175", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648175", abstract = "Join ordering and query optimization are crucial for query performance but remain challenging due to unknown or changing characteristics of query intermediates, especially for complex queries with many joins. Over the past two decades, a spectrum of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:DAG, author = "Zhiyuan Li and Xun Jian and Yue Wang and Yingxia Shao and Lei Chen", title = "{DAHA}: Accelerating {GNN} Training with Data and Hardware Aware Execution Planning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1364--1376", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648176", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648176", abstract = "Graph neural networks (GNNs) have been gaining a reputation for effective modeling of graph data. Yet, it is challenging to train GNNs efficiently. 
Many frameworks have been proposed but most of them suffer from high batch preparation cost and data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2024:FSB, author = "Ziyi Lu and Qiang Cao and Hong Jiang and Yuxing Chen and Jie Yao and Anqun Pan", title = "{FluidKV}: Seamlessly Bridging the Gap between Indexing Performance and Memory-Footprint on Ultra-Fast Storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1377--1390", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648177", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648177", abstract = "Our extensive experiments reveal that existing key-value stores (KVSs) achieve high performance at the expense of a huge memory footprint that is often impractical or unacceptable. Even with the emerging ultra-fast byte-addressable persistent memory (PM), \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shah:2024:HDC, author = "Vraj Shah and Thomas Parashos and Arun Kumar", title = "How Do Categorical Duplicates Affect {ML}? {A} New Benchmark and Empirical Analyses", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1391--1404", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648178", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648178", abstract = "The tedious grunt work involved in data preparation (prep) before ML reduces ML user productivity.
It is also a roadblock to industrial-scale cloud AutoML workflows that build ML models for millions of datasets. One important data prep step for ML is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cui:2024:CUF, author = "Pengjie Cui and Haotian Liu and Bo Tang and Ye Yuan", title = "{CGgraph}: an Ultra-Fast Graph Processing System on Modern Commodity {CPU--GPU} Co-processor", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1405--1417", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648179", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648179", abstract = "In recent years, many CPU-GPU heterogeneous graph processing systems have been developed in both academic and industrial to facilitate large-scale graph processing in various applications, e.g., social networks and biological networks. However, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:FCD, author = "Xinyu Chen and Jiannan Tian and Ian Beaver and Cynthia Freeman and Yan Yan and Jianguo Wang and Dingwen Tao", title = "{FCBench}: Cross-Domain Benchmarking of Lossless Compression for Floating-Point Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1418--1431", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648180", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648180", abstract = "While both the database and high-performance computing (HPC) communities utilize lossless compression methods to minimize floating-point data size, a disconnect persists between them. Each community designs and assesses methods in a domain-specific \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hurst:2024:PFA, author = "Aaron Hurst and Daniel E. Lucani and Qi Zhang", title = "{PairwiseHist}: Fast, Accurate and Space-Efficient Approximate Query Processing with Data Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1432--1445", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648181", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648181", abstract = "Exponential growth in data collection is creating significant challenges for data storage and analytics latency. 
Approximate Query Processing (AQP) has long been touted as a solution for accelerating analytics on large datasets, however, there is still \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:MAD, author = "Huayi Zhang and Binwei Yan and Lei Cao and Samuel Madden and Elke Rundensteiner", title = "{MetaStore}: Analyzing Deep Learning Meta-Data at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1446--1459", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648182", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648182", abstract = "The process of training deep learning models produces a huge amount of meta-data, including but not limited to losses, hidden feature embeddings, and gradients. Model diagnosis tools have been developed to analyze losses and feature embeddings with the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lv:2024:RES, author = "Yangming Lv and Kai Zhang and Ziming Wang and Xiaodong Zhang and Rubao Lee and Zhenying He and Yinan Jing and X. Sean Wang", title = "{RTScan}: Efficient Scan with Ray Tracing Cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1460--1472", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648183", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648183", abstract = "Indexing is a core technique for accelerating predicate evaluation in databases. 
After many years of effort, the indexing performance has reached its peak on the existing hardware infrastructure. We propose to use ray tracing (RT) cores to move the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2024:FRM, author = "Kezhao Huang and Haitian Jiang and Minjie Wang and Guangxuan Xiao and David Wipf and Xiang Song and Quan Gan and Zengfeng Huang and Jidong Zhai and Zheng Zhang", title = "{FreshGNN}: Reducing Memory Access via Stable Historical Embeddings for Graph Neural Network Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1473--1486", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648184", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648184", abstract = "A key performance bottleneck when training graph neural network (GNN) models on large, real-world graphs is loading node features onto a GPU. Due to limited GPU memory, expensive data movement is necessary to facilitate the storage of these features on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2024:SBA, author = "Ying Zheng and Kian-Lee Tan", title = "Sorting on Byte-Addressable Storage: The Resurgence of Tree Structure", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1487--1500", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648185", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648185", abstract = "The tree structure is notably popular for storage and indexing; however, tree-based sorting such as tree sort is rarely used in practice. Nevertheless, with the advent of byte-addressable storage (BAS), the tree structure captures our attention with its \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chatziliadis:2024:EPD, author = "Xenofon Chatziliadis and Eleni Tzirita Zacharatou and Alphan Eracar and Steffen Zeuch and Volker Markl", title = "Efficient Placement of Decomposable Aggregation Functions for Stream Processing over Large Geo-Distributed Topologies", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1501--1514", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648186", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648186", abstract = "A recent trend in stream processing is offloading the computation of decomposable aggregation functions (DAF) from cloud nodes to geo-distributed fog/edge devices to decrease latency and improve energy efficiency. 
However, deploying DAFs on low-end \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hou:2024:AEB, author = "Jiamin Hou and Zhanhao Zhao and Zhouyu Wang and Wei Lu and Guodong Jin and Dong Wen and Xiaoyong Du", title = "{AeonG}: an Efficient Built-in Temporal Support in Graph Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "6", pages = "1515--1527", month = feb, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3648160.3648187", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon May 6 06:22:36 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3648160.3648187", abstract = "Real-world graphs are often dynamic and evolve over time. It is crucial for storing and querying a graph's evolution in graph databases. However, existing works either suffer from high storage overhead or lack efficient temporal query support, or both. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:RIT, author = "Tao Yu and Zhaonian Zou and Weihua Sun and Yu Yan", title = "Refactoring Index Tuning Process with Benefit Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1528--1541", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654622", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654622", abstract = "Index tuning is a challenging task aiming to improve query performance by selecting the most effective indexes for a database and a workload. 
Existing automatic index tuning methods typically rely on ``what-if tools'' to evaluate the benefit of an index \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:LSY, author = "Xunkai Li and Meihao Liao and Zhengyu Wu and Daohan Su and Wentao Zhang and Rong-Hua Li and Guoren Wang", title = "{LightDiC}: a Simple Yet Effective Approach for Large-Scale Digraph Representation Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1542--1551", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654623", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654623", abstract = "Most existing graph neural networks (GNNs) are limited to undirected graphs, whose restricted scope of the captured relational information hinders their expressive capabilities and deployment. Compared with undirected graphs, directed graphs (digraphs) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kuang:2024:EDD, author = "Shulei Kuang and Honghui Yang and Zijing Tan and Shuai Ma", title = "Efficient Differential Dependency Discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1552--1564", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654624", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654624", abstract = "Differential dependencies (DDs) are proposed to specify constraints on the differences between values, where the semantics of difference can be ``similar'', ``dissimilar'' and beyond. DDs subsume functional dependencies (FDs), and find valuable applications \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lehmann:2024:YLQ, author = "Claude Lehmann and Pavel Sulimov and Kurt Stockinger", title = "Is Your Learned Query Optimizer Behaving As You Expect? {A} Machine Learning Perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1565--1577", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654625", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654625", abstract = "The current boom of learned query optimizers (LQO) can be explained not only by the general continuous improvement of deep learning (DL) methods but also by the straightforward formulation of a query optimization problem (QOP) as a machine learning (ML) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:MCK, author = "Zhuoxing Zhang and Sebastian Link", title = "Mixed Covers of Keys and Functional Dependencies for Maintaining the Integrity of Data under Updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1578--1590", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654626", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654626", abstract = "Covers for a set of functional dependencies (FDs) are fundamental for many areas of data management, such as integrity maintenance, query optimization, database design, and data cleaning. When declaring integrity constraints, keys enjoy native support in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:OSH, author = "Yuhao Deng and Yu Wang and Lei Cao and Lianpeng Qiao and Yuping Wang and Jingzhe Xu and Yizhou Yan and Samuel Madden", title = "Outlier Summarization via Human Interpretable Rules", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1591--1604", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654627", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654627", abstract = "Outlier detection is crucial for preventing financial fraud, network intrusions, and device failures. 
Users often expect systems to automatically summarize and interpret outlier detection results to reduce human effort and convert outliers into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2024:NEE, author = "Haitao Yuan and Gao Cong and Guoliang Li", title = "{Nuhuo}: an Effective Estimation Model for Traffic Speed Histogram Imputation on a Road Network", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1605--1617", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654628", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654628", abstract = "Traffic speed histograms show the distribution of traffic speeds over a certain period. Traffic speed might not be recorded continuously, leading to missing histograms for some links on a road network. However, accurate imputation of missing histograms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ravikumar:2024:IPP, author = "Deepak Ravikumar and Alex Yeo and Yiwen Zhu and Aditya Lakra and Harsha Nagulapalli and Santhosh Ravindran and Steve Suh and Niharika Dutta and Andrew Fogarty and Yoonjae Park and Sumeet Khushalani and Arijit Tarafdar and Kunal Parekh and Subru Krishnan", title = "Intelligent Pooling: Proactive Resource Provisioning in Large-scale Cloud Service", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1618--1627", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654629", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654629", abstract = "The proliferation of big data and analytic workloads has driven the need for cloud compute and cluster-based job processing. With Apache Spark, users can process terabytes of data at ease with hundreds of parallel executors. Providing low latency access \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ye:2024:EES, author = "Yutong Ye and Xiang Lian and Mingsong Chen", title = "Efficient Exact Subgraph Matching via {GNN}-Based Path Dominance Embedding", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1628--1641", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654630", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654630", abstract = "The classic problem of exact subgraph matching returns those subgraphs in a large-scale data graph that are isomorphic to a given query graph, which has gained increasing importance in many real-world applications such as social network analysis, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:LDH, author = "Zijia Wang and Haoran Liu and Chen Lin and Zhifeng Bao and Guoliang Li and Tianqing Wang", title = "Leveraging Dynamic and Heterogeneous Workload Knowledge to Boost the Performance of Index Advisors", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1642--1654", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654631", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654631", abstract = "Current index advisors often struggle to balance efficiency and effectiveness when dealing with workload shifts. This arises from ignorance of the continual similarity and distant variety in workloads. 
This paper proposes a novel learning-based index \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ertl:2024:UPM, author = "Otmar Ertl", title = "{UltraLogLog}: a Practical and More Space-Efficient Alternative to {HyperLogLog} for Approximate Distinct Counting", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1655--1668", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654632", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654632", abstract = "Since its invention HyperLogLog has become the standard algorithm for approximate distinct counting. Due to its space efficiency and suitability for distributed systems, it is widely used and also implemented in numerous databases. This work presents \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2024:RTI, author = "Zengyang Gong and Yuxiang Zeng and Lei Chen", title = "Real-Time Insertion Operator for Shared Mobility on Time-Dependent Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1669--1682", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654633", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654633", abstract = "One of the most important challenges in shared mobility services (e.g., ride-sharing and parcel delivery) is planning routes for workers by considering real road conditions.
To tackle this challenge, the ``insertion operator'', which computes the optimal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2024:XTM, author = "Dayi Fan and Rubao Lee and Xiaodong Zhang", title = "{X-TED}: Massive Parallelization of Tree Edit Distance", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1683--1696", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654634", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654634", abstract = "The tree edit distance (TED) has been found in a wide spectrum of applications in artificial intelligence, bioinformatics, and other areas, which serves as a metric to quantify the dissimilarity between two trees. As applications continue to scale in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shin:2024:CES, author = "Wonseok Shin and Siwoo Song and Kunsoo Park and Wook-Shin Han", title = "Cardinality Estimation of Subgraph Matching: a Filtering-Sampling Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1697--1709", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654635", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654635", abstract = "Subgraph counting is a fundamental problem in understanding and analyzing graph structured data, yet computationally challenging. 
This calls for an accurate and efficient algorithm for Subgraph Cardinality Estimation, which is to estimate the number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2024:ERS, author = "Qi Liang and Dian Ouyang and Fan Zhang and Jianye Yang and Xuemin Lin and Zhihong Tian", title = "Efficient Regular Simple Path Queries under Transitive Restricted Expressions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1710--1722", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654636", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654636", abstract = "There are two fundamental problems in regular simple path queries (RSPQs). One is the reachability problem which asks whether there exists a simple path between the source and the target vertex matching the given regular expression, and the other is the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhong:2024:MSD, author = "Shuhan Zhong and Sizhe Song and Weipeng Zhuo and Guanyao Li and Yang Liu and S.-H. 
Gary Chan", title = "A Multi-Scale Decomposition {MLP}-Mixer for Time Series Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1723--1736", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654637", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654637", abstract = "Time series data, including univariate and multivariate ones, are characterized by unique composition and complex multi-scale temporal variations. They often require special consideration of decomposition and multi-scale modeling to analyze. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xia:2024:PSS, author = "Haocheng Xia and Xiang Li and Junyuan Pang and Jinfei Liu and Kui Ren and Li Xiong", title = "{P-Shapley}: {Shapley} Values on Probabilistic Classifiers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1737--1750", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654638", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654638", abstract = "The Shapley value provides a unique approach to equitably gauge each player's contribution within a coalition and has extensive applications with various utility functions. In data valuation for machine learning, particularly for classification tasks, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2024:OVS, author = "Wenjia He and Ibrahim Sabek and Yuze Lou and Michael Cafarella", title = "Optimizing Video Selection {LIMIT} Queries with Commonsense Knowledge", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1751--1764", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654639", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654639", abstract = "Video is becoming a major part of contemporary data collection. It is increasingly important to process video selection queries --- selecting videos that contain target objects. Advances in neural networks allow us to detect the objects in an image, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huo:2024:ZZT, author = "Nan Huo and Reynold Cheng and Ben Kao and Wentao Ning and Nur Al Hasan Haldar and Xiaodong Li and Jinyang Li and Mohammad Matin Najafi and Tian Li and Ge Qu", title = "{ZeroEA}: a Zero-Training Entity Alignment Framework via Pre-Trained Language Model", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1765--1774", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654640", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654640", abstract = "Entity alignment (EA), a crucial task in knowledge graph (KG) research, aims to identify equivalent entities across different KGs to support downstream tasks like KG integration, text-to-SQL, and 
question-answering systems. Given rich semantic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:EGR, author = "Xueli Liu and Bowen Dong and Wenzhi Fu and Nannan Wu and Xin Wang and Wenjun Wang", title = "Extending Graph Rules with Oracles", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "7", pages = "1775--1787", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3654621.3654641", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri May 31 09:17:13 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3654621.3654641", abstract = "This paper proposes a class of graph rules for deducing associations between entities, referred to as Graph Rules with Oracles and denoted by GROs. As opposed to previous graph rules, GROs support oracle functions to import (a) external knowledge, and (b) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mei:2024:FME, author = "Junyi Mei and Shixuan Sun and Chao Li and Cheng Xu and Cheng Chen and Yibo Liu and Jing Wang and Cheng Zhao and Xiaofeng Hou and Minyi Guo and Bingsheng He and Xiaoliang Cong", title = "{FlowWalker}: a Memory-Efficient and High-Performance {GPU}-Based Dynamic Graph Random Walk Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1788--1801", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659438", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659438", abstract = "Dynamic graph random walk (DGRW) emerges as a practical tool for capturing structural relations within a graph. Effectively executing DGRW on GPU presents certain challenges. First, existing sampling methods demand a pre-processing buffer, causing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2024:ASK, author = "Minsu Kim and Jinwoo Hwang and Guseul Heo and Seiyeon Cho and Divya Mahajan and Jongse Park", title = "Accelerating String-Key Learned Index Structures via Memoization-Based Incremental Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1802--1815", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659439", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659439", abstract = "Learned indexes use machine learning models to learn the mappings between keys and their corresponding positions in key-value indexes. These indexes use the mapping information as training data. Learned indexes require frequent retrainings of their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liao:2024:TBC, author = "Xuankun Liao and Qing Liu and Xin Huang and Jianliang Xu", title = "Truss-Based Community Search over Streaming Directed Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1816--1829", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659440", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659440", abstract = "Community search aims to retrieve dense subgraphs that contain the query vertices. 
While many effective community models and algorithms have been proposed in the literature, none of them address the unique challenges posed by streaming graphs, where \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Salazar-Diaz:2024:IDM, author = "Ricardo Salazar-D{\'\i}az and Boris Glavic and Tilmann Rabl", title = "{InferDB}: In-Database Machine Learning Inference Using Indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1830--1842", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659441", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659441", abstract = "The performance of inference with machine learning (ML) models and its integration with analytical query processing have become critical bottlenecks for data analysis in many organizations. An ML inference pipeline typically consists of a preprocessing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2024:AAM, author = "Fei Wei and Ergute Bao and Xiaokui Xiao and Yin Yang and Bolin Ding", title = "{AAA}: an Adaptive Mechanism for Locally Differentially Private Mean Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1843--1855", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659442", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659442", abstract = "Local differential privacy (LDP) is a strong privacy standard that has been adopted by popular software systems, including Chrome, iOS, MacOS, and Windows. The main idea is that each individual perturbs their own data locally, and only submits the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:AMP, author = "Yangshen Deng and Muxi Yan and Bo Tang", title = "Accelerating {Merkle} {Patricia} Trie with {GPU}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1856--1869", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659443", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659443", abstract = "Merkle Patricia Trie (MPT) is a type of trie structure that offers efficient lookup and insert operators for immutable data systems that require multi-version access and tamper-evident controls, such as blockchains and verifiable databases. The \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:PAS, author = "Shaowei Wang and Yun Peng and Jin Li and Zikai Wen and Zhipeng Li and Shiyu Yu and Di Wang and Wei Yang", title = "Privacy Amplification via Shuffling: Unified, Simplified, and Tightened", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1870--1883", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659444", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659444", abstract = "The shuffle model of differential privacy provides promising privacy-utility balances in decentralized, privacy-preserving data analysis. However, the current analyses of privacy amplification via shuffling lack both tightness and generality. To address \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2024:DMR, author = "Jiansen Song and Wensheng Dou and Yu Gao and Ziyu Cui and Yingying Zheng and Dong Wang and Wei Wang and Jun Wei and Tao Huang", title = "Detecting Metadata-Related Logic Bugs in Database Systems via Raw Database Construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1884--1897", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659445", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659445", abstract = "Database Management Systems (DBMSs) are widely used to efficiently store and retrieve data. 
DBMSs usually support various metadata, e.g., integrity constraints for ensuring data integrity and indexes for locating data. DBMSs can further utilize these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:ZHD, author = "Biao Wu and Qiang Huang and Anthony K. H. Tung", title = "From Zero to Hero: Detecting Leaked Data through Synthetic Data Injection and Model Querying", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1898--1910", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659446", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659446", abstract = "Safeguarding the Intellectual Property (IP) of data has become critically important as machine learning applications continue to proliferate, and their success heavily relies on the quality of training data. While various mechanisms exist to secure data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:OOD, author = "Guanduo Chen and Zhenying He and Meng Li and Siqiang Luo", title = "{Oasis}: an Optimal Disjoint Segmented Learned Range Filter", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1911--1924", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659447", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659447", abstract = "The learning-enhanced data structure has inspired the development of the range filter, bringing significantly better false positive rate (FPR) than traditional non-learned range filters. Its core idea is to employ piece-wise linear functions that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:LBD, author = "Yuhao Deng and Chengliang Chai and Lei Cao and Qin Yuan and Siyuan Chen and Yanrui Yu and Zhaoze Sun and Junyi Wang and Jiajun Li and Ziqi Cao and Kaisen Jin and Chi Zhang and Yuqing Jiang and Yuanfang Zhang and Yuping Wang and Ye Yuan and Guoren Wang and Nan Tang", title = "{LakeBench}: a Benchmark for Discovering Joinable and Unionable Tables in Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1925--1938", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659448", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659448", abstract = "Discovering tables from poorly maintained data lakes is a significant challenge in data management. 
Two key tasks are identifying joinable and unionable tables, crucial for data integration, analysis, and machine learning. However, there's a lack of a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lao:2024:GMR, author = "Jiale Lao and Yibo Wang and Yufei Li and Jianping Wang and Yunjia Zhang and Zhiyuan Cheng and Wanghu Chen and Mingjie Tang and Jianguo Wang", title = "{GPTuner}: a Manual-Reading Database Tuning System via {GPT}-Guided {Bayesian} Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1939--1952", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659449", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659449", abstract = "Modern database management systems (DBMS) expose hundreds of configurable knobs to control system behaviours. Determining the appropriate values for these knobs to improve DBMS performance is a long-standing problem in the database community. As there is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ermshaus:2024:RCS, author = "Arik Ermshaus and Patrick Sch{\"a}fer and Ulf Leser", title = "Raising the {ClaSS} of Streaming Time Series Segmentation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1953--1966", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659450", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659450", abstract = "Ubiquitous sensors today emit high frequency streams of numerical measurements that reflect properties of human, animal, industrial, commercial, and natural processes. Shifts in such processes, e.g. caused by external events or internal state changes, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:FLS, author = "Qiyan Li and Jeffrey Xu Yu", title = "Fast Local Subgraph Counting", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1967--1980", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659451", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659451", abstract = "We study local subgraph counting queries, $ Q = (p, o) $, to count how many times a given $k$-node pattern graph $p$ appears around every node $ \upsilon $ in a data graph $G$ when the given center node $o$ in $p$ maps to $ \upsilon $. Such local subgraph counting becomes important in GNNs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:RER, author = "Yunjia Zhang and Jordan Henkel and Avrilia Floratou and Joyce Cahoon and Shaleen Deep and Jignesh M. Patel", title = "{ReAcTable}: Enhancing {ReAct} for Table Question Answering", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1981--1994", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659452", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659452", abstract = "Table Question Answering (TQA) presents a substantial challenge at the intersection of natural language processing and data analytics. This task involves answering natural language (NL) questions on top of tabular data, demanding proficiency in logical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ai:2024:NRS, author = "Xin Ai and Qiange Wang and Chunyu Cao and Yanfeng Zhang and Chaoyi Chen and Hao Yuan and Yu Gu and Ge Yu", title = "{NeutronOrch}: Rethinking Sample-Based {GNN} Training under {CPU--GPU} Heterogeneous Environments", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "1995--2008", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659453", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659453", abstract = "Graph Neural Networks (GNNs) have shown exceptional performance across a wide range of applications. 
Current frameworks leverage CPU-GPU heterogeneous environments for GNN model training, incorporating mini-batch and sampling techniques to mitigate GPU \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:RED, author = "Zifan Liu and Shaleen Deep and Anna Fariha and Fotis Psallidas and Ashish Tiwari and Avrilia Floratou", title = "{Rapidash}: Efficient Detection of Constraint Violations", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2009--2021", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659454", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659454", abstract = "Denial Constraint (DC) is a well-established formalism that captures a wide range of integrity constraints commonly encountered, including candidate keys, functional dependencies, and ordering constraints, among others. Given their significance, there \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohapatra:2024:DPD, author = "Shubhankar Mohapatra and Jianqiao Zong and Florian Kerschbaum and Xi He", title = "Differentially Private Data Generation with Missing Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2022--2035", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659455", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659455", abstract = "Despite several works that succeed in generating synthetic data with differential privacy (DP) guarantees, they are inadequate for generating high-quality synthetic data when the input data has missing values. In this work, we formalize the problems of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Su:2024:EYA, author = "Zhaoyuan Su and Ammar Ahmed and Zirui Wang and Ali Anwar and Yue Cheng", title = "Everything You Always Wanted to Know About Storage Compressibility of Pre-Trained {ML} Models but Were Afraid to Ask", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2036--2049", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659456", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659456", abstract = "As the number of pre-trained machine learning (ML) models is growing exponentially, data reduction tools are not catching up. Existing data reduction techniques are not specifically designed for pre-trained model (PTM) dataset files. 
This is largely due \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:FFF, author = "Haoyang Li and Shimin Di and Calvin Hong Yi Li and Lei Chen and Xiaofang Zhou", title = "Fight Fire with Fire: Towards Robust Graph Neural Networks on Dynamic Graphs via Actively Defense", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2050--2063", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659457", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659457", abstract = "Graph neural networks (GNNs) have achieved great success on various graph tasks. However, recent studies have revealed that GNNs are vulnerable to injective attacks. Due to the openness of platforms, attackers can inject malicious nodes with carefully \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zirak:2024:SLB, author = "Farzaneh Zirak and Farhana Choudhury and Renata Borovica-Gajic", title = "{SeLeP}: Learning Based Semantic Prefetching for Exploratory Database Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2064--2076", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659458", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659458", abstract = "Prefetching is a crucial technique employed in traditional databases to enhance interactivity, particularly in the context of data exploration. 
Data exploration is a query processing paradigm in which users search for insights buried in the data, often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:CEF, author = "Yiwei Chen and Kaiyu Li and Guoliang Li and Yong Wang", title = "Contributions Estimation in Federated Learning: a Comprehensive Experimental Evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2077--2090", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659459", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659459", abstract = "Federated Learning (FL) provides a privacy-preserving and decentralized approach to collaborative machine learning for multiple FL clients. The contribution estimation mechanism in FL is extensively studied within the database community, which aims to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Maroulis:2024:VAT, author = "Stavros Maroulis and Vassilis Stamatopoulos and George Papastefanatos and Manolis Terrovitis", title = "Visualization-Aware Time Series Min-Max Caching with Error Bound Guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2091--2103", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659460", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659460", abstract = "This paper addresses the challenges in interactive visual exploration of large multi-variate time series data. Traditional data reduction techniques may improve latency but can distort visualizations. State-of-the-art methods aimed at 100\% accurate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kayali:2024:CFM, author = "Moe Kayali and Anton Lykov and Ilias Fountalis and Nikolaos Vasiloglou and Dan Olteanu and Dan Suciu", title = "{Chorus}: Foundation Models for Unified Data Discovery and Exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2104--2114", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659461", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659461", abstract = "We apply foundation models to data discovery and exploration tasks. Foundation models are large language models (LLMs) that show promising performance on a range of diverse tasks unrelated to their training. 
We show that these models are highly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Leis:2024:CND, author = "Viktor Leis and Christian Dietrich", title = "Cloud-Native Database Systems and Unikernels: Reimagining {OS} Abstractions for Modern Hardware", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "8", pages = "2115--2122", month = apr, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3659437.3659462", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Jun 1 06:18:48 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3659437.3659462", abstract = "This paper explores the intersection of operating systems and database systems, focusing on the potential of specialized kernels for cloud-native database systems. Although the idea of custom, DBMS-optimized OS kernels is old, it is largely unrealized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiong:2024:CEC, author = "Haoran Xiong and Hang Zhang and Zeyu Wang and Zhenying He and Peng Wang and X. Sean Wang", title = "{CIVET}: Exploring Compact Index for Variable-Length Subsequence Matching on Time Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2123--2135", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665845", abstract = "Nowadays the demands for managing and analyzing substantially increasing collections of time series are becoming more challenging. 
Subsequence matching, as a core subroutine in time series analysis, has drawn significant research attention. Most of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kittivorawong:2024:SGV, author = "Chanwut Kittivorawong and Yongming Ge and Yousef Helal and Alvin Cheung", title = "{Spatialyze}: a Geospatial Video Analytics System with Spatial-Aware Optimizations", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2136--2148", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665846", abstract = "Videos that are shot using commodity hardware such as phones and surveillance cameras record various metadata such as time and location. We encounter such geospatial videos on a daily basis and such videos have been growing in volume significantly. Yet, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yin:2024:OMS, author = "Hanyan Yin and Dongxie Wen and Jiajun Li and Zhewei Wei and Xiao Zhang and Zengfeng Huang and Feifei Li", title = "Optimal Matrix Sketching over Sliding Windows", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2149--2161", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665847", abstract = "Matrix sketching, aimed at approximating a matrix $ A \in R^{N \times d} $ consisting of vector streams of length $N$ with a smaller sketching matrix $ B \in R^{l \times d}, l \ll N$, has garnered increasing attention in fields such as large-scale data analytics and machine learning. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Baca:2024:WFE, author = "Radim Baca", title = "Window Function Expression: Let the Self-Join Enter", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2162--2174", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665848", abstract = "Window function expressions (WFEs) became part of the SQL:2003 standard, and since then, they have often been implemented in database systems (DBS). They are especially essential to OLAP DBSs, and people use them daily. Even though WFEs are a heavily \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kakaraparthy:2024:SSD, author = "Aarati Kakaraparthy and Jignesh M. Patel", title = "{SplitDF}: Splitting Dataframes for Memory-Efficient Data Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2175--2184", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665849", abstract = "Dataframe is a popular construct in data analysis libraries that offers a tabular view of the data. However, data within a dataframe often has redundancy, which can lead to high memory utilization of data analysis libraries. Inspired by the process of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Daliri:2024:SMI, author = "Majid Daliri and Juliana Freire and Christopher Musco and A{\'e}cio Santos and Haoxiang Zhang", title = "Sampling Methods for Inner Product Sketching", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2185--2197", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665850", abstract = "Recently, Bessa et al. (PODS 2023) showed that sketches based on coordinated weighted sampling theoretically and empirically outperform popular linear sketching methods like Johnson-Lindenstrauss projection and CountSketch for the ubiquitous problem of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2024:DDI, author = "Han Hu and Jiye Qiu and Hongzhi Wang and Bin Liang and Songling Zou", title = "{DIDS}: Double Indices and Double Summarizations for Fast Similarity Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2198--2211", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665851", abstract = "Data series has been one of the significant data forms in various applications. It becomes imperative to devise a data series index that supports both approximate and exact similarity searches for large data series collections in high-dimensional metric \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:IGC, author = "Qian Xu and Juan Yang and Feng Zhang and Zheng Chen and Jiawei Guan and Kang Chen and Ju Fan and Youren Shen and Ke Yang and Yu Zhang and Xiaoyong Du", title = "Improving Graph Compression for Efficient Resource-Constrained Graph Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2212--2226", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665852", abstract = "Recent studies have shown the promise of directly processing compressed graphs. However, its benefits have been limited by high peak-memory usage and unbearably long compression time. 
In this paper, we introduce Laconic, a novel rule-based graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:EUC, author = "Jianwei Wang and Kai Wang and Xuemin Lin and Wenjie Zhang and Ying Zhang", title = "Efficient Unsupervised Community Search with Pre-Trained Graph Transformer", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2227--2240", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665853", abstract = "Community search has aroused widespread interest in the past decades. Among existing solutions, the learning-based models exhibit outstanding performance in terms of accuracy by leveraging labels to (1) train the model for community score learning, and (2) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2024:LLS, author = "Jiuqi Wei and Botao Peng and Xiaodong Lee and Themis Palpanas", title = "{DET-LSH}: a Locality-Sensitive Hashing Scheme with Dynamic Encoding Tree for Approximate Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2241--2254", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665854", abstract = "Locality-sensitive hashing (LSH) is a well-known solution for approximate nearest neighbor (ANN) search in high-dimensional spaces due to its robust theoretical guarantee on query accuracy. Traditional LSH-based methods mainly focus on improving the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:BEA, author = "Haoyu Liu and Siqiang Luo", title = "{BIRD}: Efficient Approximation of Bidirectional Hidden Personalized {PageRank}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2255--2268", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665855", abstract = "In bipartite graph analysis, similarity measures play a pivotal role in various applications. Among existing metrics, the Bidirectional Hidden Personalized PageRank (BHPP) stands out for its superior query quality. 
However, the computational expense of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:GGP, author = "Zihao Yu and Ningyi Liao and Siqiang Luo", title = "{GENTI}: {GPU}-Powered Walk-Based Subgraph Extraction for Scalable Representation Learning on Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2269--2278", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665856", abstract = "Graph representation learning is an emerging task for effectively embedding graph-structured data with learned features. Among them, Subgraph-based GRL (SGRL) methods have demonstrated better scalability and expressiveness for large-scale tasks. The core \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Feuer:2024:ANF, author = "Benjamin Feuer and Yurong Liu and Chinmay Hegde and Juliana Freire", title = "{ArcheType}: a Novel Framework for Open-Source Column Type Annotation Using Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2279--2292", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665857", abstract = "Existing deep-learning approaches to semantic column type annotation (CTA) have important shortcomings: they rely on semantic types which are fixed at training time; require a large number of training samples per type; incur high run-time inference costs; \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2024:TSM, author = "Yanchuan Chang and Egemen Tanin and Gao Cong and Christian S. Jensen and Jianzhong Qi", title = "Trajectory Similarity Measurement: an Efficiency Perspective", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2293--2306", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665858", abstract = "Trajectories that capture object movement have numerous applications, in which similarity computation between trajectories often plays a key role. 
Traditionally, trajectory similarity is quantified by means of non-learned measures, e.g., Hausdorff, that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wheatman:2024:BUF, author = "Brian Wheatman and Xiaojun Dong and Zheqi Shen and Laxman Dhulipala and Jakub Lacki and Prashant Pandey and Helen Xu", title = "{BYO}: a Unified Framework for Benchmarking Large-Scale Graph Containers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2307--2320", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665859", abstract = "A fundamental building block in any graph algorithm is a graph container --- a data structure used to represent the graph. Ideally, a graph container enables efficient access to the underlying graph, has low space usage, and supports updating the graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:SVD, author = "Yizheng Zhu and Yuncheng Wu and Zhaojing Luo and Beng Chin Ooi and Xiaokui Xiao", title = "Secure and Verifiable Data Collaboration with Low-Cost Zero-Knowledge Proofs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2321--2334", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665860", abstract = "Federated Learning (FL) emerges as a viable solution to facilitate data collaboration, enabling multiple clients to collaboratively train a machine learning (ML) model under the supervision of a central server while ensuring the confidentiality of their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nagda:2024:RDD, author = "Heena Nagda and Shubhendra Pal Singhal and Mohammad Javad Amiri and Boon Thau Loo", title = "{Rashnu}: Data-Dependent Order-Fairness", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2335--2348", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665861", abstract = "Distributed data management systems use state Machine Replication (SMR) to provide fault tolerance. The SMR algorithm enables Byzantine Fault-Tolerant (BFT) protocols to guarantee safety and liveness despite the malicious failure of nodes. 
However, SMR \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2024:SBA, author = "Yuchuan Huang and Mohamed F. Mokbel", title = "{Sparcle}: Boosting the Accuracy of Data Cleaning Systems through Spatial Awareness", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2349--2362", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665862", abstract = "Though data cleaning systems have earned great success and wide spread in both academia and industry, they fall short when trying to clean spatial data. The main reason is that state-of-the-art data cleaning systems mainly rely on functional dependency \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiu:2024:TTC, author = "Xiangfei Qiu and Jilin Hu and Lekui Zhou and Xingjian Wu and Junyang Du and Buang Zhang and Chenjuan Guo and Aoying Zhou and Christian S. 
Jensen and Zhenli Sheng and Bin Yang", title = "{TFB}: Towards Comprehensive and Fair Benchmarking of Time Series Forecasting Methods", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2363--2377", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665863", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665863", abstract = "Time series are generated in diverse domains such as economic, traffic, health, and energy, where forecasting of future values has numerous important applications. Not surprisingly, many forecasting methods are being proposed. To ensure progress, it is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:FFG, author = "Chengjun Liu and Zhuo Peng and Weiguo Zheng and Lei Zou", title = "{FSM}: a Fine-Grained Splitting and Merging Framework for Dual-Balanced Graph Partition", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2378--2391", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665864", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665864", abstract = "Partitioning a large graph into smaller subgraphs by minimizing the number of cutting vertices and edges, namely cut size or replication factor, plays a crucial role in distributed graph processing tasks. However, many prior works have primarily focused \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Marchesin:2024:ERE, author = "Stefano Marchesin and Gianmaria Silvello", title = "Efficient and Reliable Estimation of Knowledge Graph Accuracy", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "9", pages = "2392--2403", month = may, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3665844.3665865", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:54 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3665844.3665865", abstract = "Data accuracy is a central dimension of data quality, especially when dealing with Knowledge Graphs (KGs). Auditing the accuracy of KGs is essential to make informed decisions in entity-oriented services or applications. However, manually evaluating the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2024:BID, author = "Wei Zhou and Chen Lin and Xuanhe Zhou and Guoliang Li", title = "Breaking It Down: an In-Depth Study of Index Advisors", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2405--2418", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675035", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675035", abstract = "Index advisors aim to improve workload performance by judiciously selecting an appropriate set of indexes. Various heuristic-based and learning-based methods have been proposed. However, there lacks a comprehensive assessment of existing index advisors, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:AMC, author = "Wen Deng and Weiguo Zheng and Hong Cheng", title = "Accelerating Maximal Clique Enumeration via Graph Reduction", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2419--2431", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675036", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675036", abstract = "As a fundamental task in graph data management, maximal clique enumeration (MCE) has attracted extensive attention from both academic and industrial communities due to its wide range of applications. However, MCE is very challenging as the number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bai:2024:PPB, author = "Jiyang Bai and Peixiang Zhao", title = "{Poligras}: Policy-Based Graph Summarization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2432--2444", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675037", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675037", abstract = "Large graphs are ubiquitous. Their sizes, rates of growth, and complexity, however, have significantly outpaced human capabilities to ingest and make sense of them. As a cost-effective graph simplification technique, graph summarization is aimed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2024:SSW, author = "Leqian Zheng and Lei Xu and Cong Wang and Sheng Wang and Yuke Hu and Zhan Qin and Feifei Li and Kui Ren", title = "{SWAT}: a System-Wide Approach to Tunable Leakage Mitigation in Encrypted Data Stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2445--2458", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675038", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675038", abstract = "Numerous studies have underscored the significant privacy risks associated with various leakage patterns in encrypted data stores. While many solutions have been proposed to mitigate these leakages, they either (1) incur substantial overheads, (2) focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:TTI, author = "Kai Wang and Yuwei Xu and Siqiang Luo", title = "{TIGER}: Training Inductive Graph Neural Network for Large-Scale Knowledge Graph Reasoning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2459--2472", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675039", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675039", abstract = "Knowledge Graph (KG) Reasoning plays a vital role in various applications by predicting missing facts from existing knowledge. 
Inductive KG reasoning approaches based on Graph Neural Networks (GNNs) have shown impressive performance, particularly when \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:ISW, author = "Chao Zhang and Angela Bonifati and M. Tamer {\"O}zsu", title = "Incremental Sliding Window Connectivity over Streaming Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2473--2486", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675040", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675040", abstract = "We study index-based processing for connectivity queries within sliding windows on streaming graphs. These queries, which determine whether two vertices belong to the same connected component, are fundamental operations in real-time graph data processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2024:CEC, author = "Qingpeng Cai and Kaiping Zheng and H. V. Jagadish and Beng Chin Ooi and James Yip", title = "{CohortNet}: Empowering Cohort Discovery for Interpretable Healthcare Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2487--2500", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675041", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675041", abstract = "Cohort studies are of significant importance in the field of healthcare analytics. 
However, existing methods typically involve manual, labor-intensive, and expert-driven pattern definitions or rely on simplistic clustering techniques that lack medical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:EIM, author = "Jinghao Wang and Yanping Wu and Xiaoyang Wang and Ying Zhang and Lu Qin and Wenjie Zhang and Xuemin Lin", title = "Efficient Influence Minimization via Node Blocking", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2501--2513", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675042", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675042", abstract = "Given a graph G, a budget k and a misinformation seed set S, Influence Minimization (IMIN) via node blocking aims to find a set of k nodes to be blocked such that the expected spread of S is minimized. This problem finds important applications in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2024:DBD, author = "Xuanhe Zhou and Guoliang Li and Zhaoyan Sun and Zhiyuan Liu and Weize Chen and Jianming Wu and Jiesi Liu and Ruohang Feng and Guoyang Zeng", title = "{D-Bot}: Database Diagnosis System using Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2514--2527", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675043", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675043", abstract = "Database administrators (DBAs) play an important role in managing database systems. However, it is hard and tedious for DBAs to manage vast database instances and give timely response (waiting for hours is intolerable in many online cases). In addition, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiao:2024:BFS, author = "Yiming Qiao and Yihan Gao and Huanchen Zhang", title = "{Blitzcrank}: Fast Semantic Compression for In-Memory Online Transaction Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2528--2540", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675044", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675044", abstract = "We present Blitzcrank, a high-speed semantic compressor designed for OLTP databases. 
Previous solutions are inadequate for compressing row-stores: they suffer from either low compression factor due to a coarse compression granularity or suboptimal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:SSS, author = "Zhihao Chen and Tianji Yang and Yixiao Zheng and Zhao Zhang and Cheqing Jin and Aoying Zhou", title = "{Spectrum}: Speedy and Strictly-Deterministic Smart Contract Transactions for Blockchain Ledgers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2541--2554", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675045", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675045", abstract = "Today, blockchain ledgers utilize concurrent deterministic execution schemes to scale up. However, ordering fairness is not preserved in these schemes: although they ensure all replicas achieve the same serial order, this order does not always align with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:FCG, author = "Zihao Zhang and Huiqi Hu and Xuan Zhou and Yaofeng Tu and Weining Qian and Aoying Zhou", title = "Fast Commitment for Geo-Distributed Transactions via Decentralized Co-Coordinators", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2555--2567", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675046", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675046", abstract = "In a geo-distributed database, data shards and their respective replicas are deployed in distinct datacenters across multiple regions, enabling regional-level disaster recovery and the ability to serve global users locally. However, transaction \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lerner:2024:CRS, author = "Alberto Lerner and Gustavo Alonso", title = "{CXL} and the Return of Scale-Up Database Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2568--2575", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675047", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675047", abstract = "The trend toward specialized processing devices such as TPUs, DPUs, GPUs, and FPGAs has exposed the weaknesses of PCIe in interconnecting these devices and their hosts. Several attempts have been proposed to improve, augment, or downright replace PCIe, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2024:IAC, author = "Shuheng Fang and Kangfei Zhao and Yu Rong and Zhixun Li and Jeffrey Xu Yu", title = "Inductive Attributed Community Search: To Learn Communities Across Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2576--2589", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675048", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675048", abstract = "Attributed community search (ACS) aims to identify subgraphs satisfying both structure cohesiveness and attribute homogeneity in attributed graphs, for a given query that contains query nodes and query attributes. Previously, algorithmic approaches deal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2024:ELC, author = "Long Yuan and Xia Li and Zi Chen and Xuemin Lin and Xiang Zhao and Wenjie Zhang", title = "{I/O} Efficient Label-Constrained Reachability Queries in Large Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2590--2602", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675049", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675049", abstract = "Computing the reachability between two vertices in a graph is a fundamental problem in graph data analysis. 
Most of the existing works assume that the edges in the graph have no labels, but in many real application scenarios, edges naturally come with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2024:DSR, author = "Baotong Lu and Kaisong Huang and Chieh-Jan Mike Liang and Tianzheng Wang and Eric Lo", title = "{DEX}: Scalable Range Indexing on Disaggregated Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2603--2616", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675050", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675050", abstract = "Memory disaggregation can potentially allow memory-optimized range indexes such as B+-trees to scale beyond one machine while attaining high hardware utilization and low cost. Designing scalable indexes on disaggregated memory, however, is challenging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ni:2024:ADR, author = "Wei Ni and Xiaoye Miao and Xiangyu Zhao and Yangyang Wu and Shuwei Liang and Jianwei Yin", title = "Automatic Data Repair: Are We Ready to Deploy?", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2617--2630", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675051", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675051", abstract = "Data quality is paramount in today's data-driven world, especially in the era of generative AI. 
Dirty data with errors and inconsistencies usually leads to flawed insights, unreliable decision-making, and biased or low-quality outputs from generative \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2024:BHM, author = "Chaokun Chang and Eric Lo and Chunxiao Ye", title = "{Biathlon}: Harnessing Model Resilience for Accelerating {ML} Inference Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2631--2640", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675052", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675052", abstract = "Machine learning inference pipelines commonly encountered in data science and industries often require real-time responsiveness due to their user-facing nature. However, meeting this requirement becomes particularly challenging when certain input \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeng:2024:DSD, author = "Yuanyuan Zeng and Chenhao Ma and Yixiang Fang", title = "Distributed Shortest Distance Labeling on Large-Scale Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2641--2653", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675053", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675053", abstract = "Distance labeling approaches are widely adopted to speed up the shortest-distance query performance. 
Due to the explosive growth of data graphs, a single machine can hardly satisfy the requirements of both computational power and memory capacity, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2024:EPD, author = "Wensheng Luo and Yixiang Fang and Chunxu Lin and Yingli Zhou", title = "Efficient Parallel {D}-Core Decomposition at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2654--2667", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675054", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675054", abstract = "Directed graphs are prevalent in social networks, web networks, and communication networks. A well-known concept of the directed graph is the D-core, or $(k, l)$-core, which is the maximal subgraph in which each vertex has an in-degree not less than $k$ and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pellegrina:2024:EDS, author = "Leonardo Pellegrina and Fabio Vandin", title = "Efficient Discovery of Significant Patterns with Few-Shot Resampling", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "10", pages = "2668--2680", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3675034.3675055", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Wed Aug 7 06:07:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3675034.3675055", abstract = "Significant pattern mining is a fundamental task in mining transactional data, requiring to identify patterns significantly associated with the value of a given feature, the target. In several applications, such as biomedicine, basket market analysis, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:RBP, author = "Qixu Chen and Raymond Chi-Wing Wong", title = "Robust Best Point Selection under Unreliable User Feedback", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "2681--2693", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681955", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Nov 9 16:34:53 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3681954.3681955", abstract = "The task of finding a user's utility function (representing the user's preference) by asking them to compare pairs of points through a series of questions, each requiring him/her to compare 2 points for choosing a more preferred one, to find the best \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Cheng:2024:TOT,
  author =       "Audrey Cheng and Aaron Kabcenell and Jason Chan and Xiao Shi and Peter Bailis and Natacha Crooks and Ion Stoica",
  title =        "Towards Optimal Transaction Scheduling",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2694--2707",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681956",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681956",
  abstract =     "Maximizing transaction throughput is key to high-performance database systems, which focus on minimizing data access conflicts to improve performance. However, finding efficient schedules that reduce conflicts remains an open problem. For efficiency, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Campos:2024:QDE,
  author =       "David Campos and Bin Yang and Tung Kieu and Miao Zhang and Chenjuan Guo and Christian S. Jensen",
  title =        "{QCore}: Data-Efficient, On-Device Continual Calibration for Quantized Models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2708--2721",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681957",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681957",
  abstract =     "We are witnessing an increasing availability of streaming data that may contain valuable information on the underlying processes.
It is thus attractive to be able to deploy machine learning models, e.g., for classification, on edge devices near sensors \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2024:EAP,
  author =       "Yalong Zhang and Rong-Hua Li and Qi Zhang and Hongchao Qin and Lu Qin and Guoren Wang",
  title =        "Efficient Algorithms for Pseudoarboricity Computation in Large Static and Dynamic Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2722--2734",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681958",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681958",
  abstract =     "The arboricity $a(G)$ of a graph $G$ is defined as the minimum number of edge-disjoint forests that the edge set of $G$ can be partitioned into. It is a fundamental metric and has been widely used in many graph analysis applications. However, computing $a(G)$ is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2024:RPB,
  author =       "Meng Chen and Kai Zhang and Zhenying He and Yinan Jing and X.
Sean Wang",
  title =        "{RoarGraph}: a Projected Bipartite Graph for Efficient Cross-Modal Approximate Nearest Neighbor Search",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2735--2749",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681959",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681959",
  abstract =     "Approximate Nearest Neighbor Search (ANNS) is a fundamental and critical component in many applications, including recommendation systems and large language model-based applications. With the advancement of multimodal neural models, which transform data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Fan:2024:CSL,
  author =       "Ju Fan and Zihui Gu and Songyue Zhang and Yuxin Zhang and Zui Chen and Lei Cao and Guoliang Li and Samuel Madden and Xiaoyong Du and Nan Tang",
  title =        "Combining Small Language Models and Large Language Models for Zero-Shot {NL2SQL}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2750--2763",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681960",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681960",
  abstract =     "Zero-shot natural language to SQL (NL2SQL) aims to generalize pretrained NL2SQL models to new environments (e.g., new databases and new linguistic phenomena) without any annotated NL2SQL samples from these environments. Existing approaches either use \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Guliyev:2024:DGD,
  author =       "Rustam Guliyev and Aparajita Haldar and Hakan Ferhatosmanoglu",
  title =        "{D3-GNN}: Dynamic Distributed Dataflow for Streaming Graph Neural Networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2764--2777",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681961",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681961",
  abstract =     "Graph Neural Network (GNN) models on streaming graphs entail algorithmic challenges to continuously capture its dynamic state, as well as systems challenges to optimize latency, memory, and throughput during both inference and training. We present D3-GNN, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Su:2024:DBO,
  author =       "Yunxiang Su and Shaoxu Song and Xiangdong Huang and Chen Wang and Jianmin Wang",
  title =        "Distance-Based Outlier Query Optimization in {Apache IoTDB}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2778--2790",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681962",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681962",
  abstract =     "While outlier detection has been widely studied over streaming data, the query of outliers in time series databases was largely overlooked.
Apache IoTDB, an open-source time series database, employs LSM-tree based storage to support intensive writing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2024:TMF,
  author =       "Jianye Yang and Sheng Fang and Zhaoquan Gu and Ziyi Ma and Xuemin Lin and Zhihong Tian",
  title =        "{TC-Match}: Fast Time-Constrained Continuous Subgraph Matching",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2791--2804",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681963",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681963",
  abstract =     "Continuously monitoring structural patterns in streaming graphs is a critical task in many real-time graph-based applications. In this paper, we study the problem of time-constrained continuous subgraph matching (shorted as TCSM) over streaming graphs. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wornow:2024:AEF,
  author =       "Michael Wornow and Avanika Narayan and Krista Opsahl-Ong and Quinn McIntyre and Nigam Shah and Christopher R{\'e}",
  title =        "Automating the Enterprise with Foundation Models",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2805--2812",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681964",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681964",
  abstract =     "Automating enterprise workflows could unlock \$4 trillion/year in productivity gains.
Despite being of interest to the data management community for decades, the ultimate vision of end-to-end workflow automation has remained elusive. Current solutions rely on process mining and robotic process automation (RPA), in which a bot is hard-coded to follow a set of predefined rules for completing a workflow. Through case studies of a hospital and large B2B enterprise, we find that the adoption of RPA has been inhibited by high set-up costs (12--18 months), unreliable execution (60\% initial accuracy), and burdensome maintenance (requiring multiple FTEs). Multimodal foundation models (FMs) such as GPT-4 offer a promising new approach for end-to-end workflow automation given their generalized reasoning and planning abilities. To study these capabilities we propose ECLAIR, a system to automate enterprise workflows with minimal human supervision. We conduct initial experiments showing that multimodal FMs can address the limitations of traditional RPA with (1) near-human-level understanding of workflows (93\% accuracy on a workflow understanding task) and (2) instant set-up with minimal technical barrier (based solely on a natural language description of a workflow, ECLAIR achieves end-to-end completion rates of 40\%). We identify human-AI collaboration, validation, and self-improvement as open challenges, and suggest ways they can be solved with data management techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Tian:2024:EIT,
  author =       "Anxin Tian and Alexander Zhou and Yue Wang and Xun Jian and Lei Chen",
  title =        "Efficient Index for Temporal Core Queries over Bipartite Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2813--2825",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681965",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681965",
  abstract =     "Many real-world binary relations can be modelled as bipartite graphs, which can be inherently temporal and each edge is associated with a timestamp. The $ (\alpha, \beta)$-core, a popular structure that requires minimum degrees over two layers of vertices, is useful \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kato:2024:UFF,
  author =       "Fumiyuki Kato and Li Xiong and Shun Takagi and Yang Cao and Masatoshi Yoshikawa",
  title =        "{Uldp-FL}: Federated Learning with Across-Silo User-Level Differential Privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2826--2839",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681966",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681966",
  abstract =     "Differentially Private Federated Learning (DP-FL) has garnered attention as a collaborative machine learning approach that ensures formal privacy. Most DP-FL approaches ensure DP at the record-level within each silo for cross-silo FL.
However, a single \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yang:2024:EFI,
  author =       "Junyong Yang and Ming Zhong and Yuanyuan Zhu and Tieyun Qian and Mengchi Liu and Jeffrey Xu Yu",
  title =        "Evolution Forest Index: Towards Optimal Temporal $k$-Core Component Search via Time-Topology Isomorphic Computation",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2840--2853",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681967",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681967",
  abstract =     "For a temporal graph like transaction network, finding a densely connected subgraph that contains a vertex like a suspicious account during a period is valuable. Thus, we study the Temporal $k$-Core Component Search (TCCS) problem, which aims to find a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ma:2024:EDP,
  author =       "Yuxin Ma and Ping Gong and Tianming Wu and Jiawei Yi and Chengru Yang and Cheng Li and Qirong Peng and Guiming Xie and Yongcheng Bao and Haifeng Liu and Yinlong Xu",
  title =        "Eliminating Data Processing Bottlenecks in {GNN} Training over Large Graphs via Two-level Feature Compression",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2854--2866",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681968",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681968",
  abstract =     "Training GNNs over large graphs faces a severe data processing bottleneck, involving both sampling and feature loading. To tackle this issue, we introduce F$^2$CGT, a fast GNN training system incorporating feature compression. To avoid potential accuracy \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Rumbaugh:2024:TSI,
  author =       "Douglas B. Rumbaugh and Dong Xie and Zhuoyue Zhao",
  title =        "Towards Systematic Index Dynamization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2867--2879",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681969",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681969",
  abstract =     "There is significant interest in examining large datasets using complex domain-specific queries. In many cases, these queries can be accelerated using specialized indexes.
Unfortunately, the development of a practical index is difficult, because \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Treder-Tschechlov:2024:ECB,
  author =       "Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang",
  title =        "Ensemble Clustering Based on Meta-Learning and Hyperparameter Optimization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2880--2892",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681970",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681970",
  abstract =     "Efficient clustering algorithms, such as $k$-Means, are often used in practice because they scale well for large datasets. However, they are only able to detect simple data characteristics. Ensemble clustering can overcome this limitation by combining \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Guo:2024:ESR,
  author =       "Chenjuan Guo and Ronghui Xu and Bin Yang and Ye Yuan and Tung Kieu and Yan Zhao and Christian S.
Jensen",
  title =        "Efficient Stochastic Routing in Path-Centric Uncertain Road Networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2893--2905",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681971",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681971",
  abstract =     "The availability of massive vehicle trajectory data enables the modeling of road-network constrained movement as travel-cost distributions rather than just single-valued costs, thereby capturing the inherent uncertainty of movement and enabling improved \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Bonifati:2024:TPG,
  author =       "Angela Bonifati and Filip Murlak and Yann Ramusat",
  title =        "Transforming Property Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2906--2918",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681972",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681972",
  abstract =     "In this paper, we study a declarative framework for specifying transformations of property graphs. In order to express such transformations, we leverage queries formulated in the Graph Pattern Calculus (GPC), which is an abstraction of the common core of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sun:2024:LLM,
  author =       "Yushi Sun and Hao Xin and Kai Sun and Yifan Ethan Xu and Xiao Yang and Xin Luna Dong and Nan Tang and Lei Chen",
  title =        "Are Large Language Models a Good Replacement of Taxonomies?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2919--2932",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681973",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681973",
  abstract =     "Large language models (LLMs) demonstrate an impressive ability to internalize knowledge and answer natural language questions. Although previous studies validate that LLMs perform well on general knowledge while presenting poor performance on long-tail \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhang:2024:EAD,
  author =       "Yalong Zhang and Rong-Hua Li and Qi Zhang and Hongchao Qin and Guoren Wang",
  title =        "Efficient Algorithms for Density Decomposition on Large Static and Dynamic Graphs",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2933--2945",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681974",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681974",
  abstract =     "Locally-densest subgraph (LDS) decomposition is a fundamental decomposition in graph analysis that finds numerous applications in various domains, including community detection, fraud detection, graph querying, and graph visualization.
However, the LDS \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zhou:2024:EMM,
  author =       "Yingli Zhou and Yixiang Fang and Chenhao Ma and Tianci Hou and Xin Huang",
  title =        "Efficient Maximal {Motif}-Clique Enumeration over Large Heterogeneous Information Networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2946--2959",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681975",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681975",
  abstract =     "In the heterogeneous information network (HIN), a motif-clique is a ``complete graph'' for a given motif (or a small connected graph) that could capture the desired relationship in the motif. The maximal motif-cliques of HINs have found various \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sheng:2024:OCR,
  author =       "Zeang Sheng and Wentao Zhang and Yangyu Tao and Bin Cui",
  title =        "{OUTRE}: an {OUT-of-Core De-REdundancy} {GNN} Training Framework for Massive Graphs within A Single Machine",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2960--2973",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681976",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681976",
  abstract =     "Sampling-based Graph Neural Networks (GNNs) have become the de facto standard for handling various graph learning tasks on large-scale graphs.
As the graph size grows larger and even exceeds the standard host memory size of a single machine, out-of-core \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Fang:2024:RSA,
  author =       "Chenguang Fang and Zijie Chen and Shaoxu Song and Xiangdong Huang and Chen Wang and Jianmin Wang",
  title =        "On Reducing Space Amplification with Multi-Column Compaction in {Apache IoTDB}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2974--2986",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681977",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681977",
  abstract =     "Log-structured merge trees (LSM-trees) are commonly employed as the storage engines for write-intensive workloads in modern time series databases including Apache IoTDB. Following append-only principle, LSM-trees can handle intensive writes and updates, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Schmidl:2024:AUH,
  author =       "Sebastian Schmidl and Felix Naumann and Thorsten Papenbrock",
  title =        "{AutoTSAD}: Unsupervised Holistic Anomaly Detection for Time Series Data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "2987--3002",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681978",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681978",
  abstract =     "Detecting anomalous subsequences in time series data is one of the key tasks in time series analytics, having applications in environmental monitoring, preventive healthcare, predictive maintenance, and many further areas. Data scientists have developed \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chen:2024:EWB,
  author =       "Zheng Chen and Feng Zhang and Yang Chen and Xiaokun Fang and Guanyu Feng and Xiaowei Zhu and Wenguang Chen and Xiaoyong Du",
  title =        "Enabling Window-Based Monotonic Graph Analytics with Reusable Transitional Results for Pattern-Consistent Queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3003--3016",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681979",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681979",
  abstract =     "Evolving graphs consisting of slices are large and constantly changing. For example, in Alipay, the graph generates hundreds of millions of new transaction records every day.
Analyzing the graph within a temporary window is time-consuming due to the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Ramos:2024:WAS,
  author =       "Maria Ramos and Jo{\~a}o Azevedo and Kyle Kingsbury and Jos{\'e} Pereira and T{\^a}nia Esteves and Ricardo Macedo and Jo{\~a}o Paulo",
  title =        "When Amnesia Strikes: Understanding and Reproducing Data Loss Bugs with Fault Injection",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3017--3030",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681980",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681980",
  abstract =     "We present LazyFS, a new fault injection tool that simplifies the debugging and reproduction of complex data durability bugs experienced by databases, key-value stores, and other data-centric systems in crashes. Our tool simulates persistence properties \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2024:PTA,
  author =       "Leixia Wang and Qingqing Ye and Haibo Hu and Xiaofeng Meng",
  title =        "{PriPL-Tree}: Accurate Range Query for Arbitrary Distribution under Local Differential Privacy",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3031--3044",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681981",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681981",
  abstract =     "Answering range queries in the context of Local Differential Privacy (LDP) is a widely studied problem in Online Analytical Processing (OLAP). Existing LDP solutions all assume a uniform data distribution within each domain partition, which may not align \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Sun:2024:WWS,
  author =       "Yu Sun and Jingyu Zhu and Xiao Xu and Xian Xu and Yuyao Sun and Shaoxu Song and Xiang Li and Xiaojie Yuan",
  title =        "{Win-Win}: On Simultaneous Clustering and Imputing over Incomplete Data",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3045--3057",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681982",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681982",
  abstract =     "Although clustering methods have shown promising performance in various applications, they cannot effectively handle incomplete data.
Existing studies often impute missing values first before clustering analysis and conduct these two processes \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Takagi:2024:HDP,
  author =       "Shun Takagi and Li Xiong and Fumiyuki Kato and Yang Cao and Masatoshi Yoshikawa",
  title =        "{HRNet}: Differentially Private Hierarchical and Multi-Resolution Network for Human Mobility Data Synthesization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3058--3071",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681983",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681983",
  abstract =     "Human mobility data offers valuable insights for many applications such as urban planning and pandemic response, but its use also raises privacy concerns. In this paper, we introduce the Hierarchical and Multi-Resolution Network (HRNet), a novel deep \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Dong:2024:EMI,
  author =       "Sijie Dong and Qitong Wang and Soror Sahri and Themis Palpanas and Divesh Srivastava",
  title =        "Efficiently Mitigating the Impact of Data Drift on Machine Learning Pipelines",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3072--3081",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681984",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Sat Nov 9 16:34:53 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  URL =          "https://dl.acm.org/doi/10.14778/3681954.3681984",
  abstract =     "Despite the increasing success of Machine Learning (ML) techniques in real-world applications, their maintenance over time remains challenging. In particular, the prediction accuracy of deployed ML models can suffer due to significant changes between \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2024:PEA,
  author =       "Libin Wang and Raymond Chi-Wing Wong",
  title =        "{PCSP}: Efficiently Answering Label-Constrained Shortest Path Queries in Road Networks",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "17",
  number =       "11",
  pages =        "3082--3094",
  month =        jul,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3681954.3681985",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Thu Oct 2 16:17:40 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Shortest path queries are ubiquitous in many spatial applications. Existing solutions assign numerical weights to edges and compute the path with the minimum sum of edge weights. However, in practice, the road categories associated with edges (e.g., toll) \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fejza:2024:EER, author = "Amela Fejza and Pierre Genev{\`e}s and Nabil Laya{\"\i}da", title = "Efficient Enumeration of Recursive Plans in Transformation-Based Query Optimizers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3095--3108", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681986", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers built on the transformation-based Volcano/Cascades framework are used in many database systems. Transformations proposed earlier on the logical query dag (LQDAG) data structure, which is key in such a framework, are restricted to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2024:ERA, author = "Mengyi Yan and Wenfei Fan and Yaoshu Wang and Min Xie", title = "Enriching Relations with Additional Attributes for {ER}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3109--3123", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681987", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies a new problem of relation enrichment. Given a relation D of schema R and a knowledge graph G with overlapping information, it is to identify a small number of relevant features from G, and extend schema R with the additional attributes, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:EAS, author = "Haibo Wang", title = "Enhancing Accuracy for Super Spreader Identification in High-Speed Data Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3124--3137", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681988", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper addresses the challenge of identifying super spreaders within large, high-speed data streams. In these streams, data is segmented into flows, with each flow's spread defined as the number of distinct items it contains. A super spreader is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Seeman:2024:PAQ, author = "Jeremy Seeman and William Sexton and David Pujol and Ashwin Machanavajjhala", title = "Privately Answering Queries on Skewed Data via Per-Record Differential Privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3138--3150", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681989", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the problem of the private release of statistics (like payroll) where it is critical to preserve the contribution made by a small number of outlying large entities. We propose a privacy formalism, per-record zero concentrated differential \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Al-Sayeh:2024:AAS, author = "Hani Al-Sayeh and Muhammad Attahir Jibril and Kai-Uwe Sattler", title = "{Agile-Ant}: Self-Managing Distributed Cache Management for Cost Optimization of Big Data Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3151--3164", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681990", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed in-memory processing frameworks accelerate application runs by caching important datasets in memory. Allocating a suitable cluster configuration for caching these datasets plays a crucial role in achieving minimal cost. We present Agile-ant, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Alevizos:2024:CER, author = "Elias Alevizos and Alexander Artikis and Georgios Paliouras", title = "Complex Event Recognition with Symbolic Register Transducers", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3165--3177", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681991", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a system for Complex Event Recognition (CER) based on automata. While multiple such systems have been described in the literature, they typically suffer from a lack of clear and denotational semantics, a limitation which often leads to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2024:HHA, author = "Yupeng Xie and Yuyu Luo and Guoliang Li and Nan Tang", title = "{HAIChart}: Human and {AI} Paired Visualization System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3178--3191", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681992", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The growing importance of data visualization in business intelligence and data science emphasizes the need for tools that can efficiently generate meaningful visualizations from large datasets. Existing tools fall into two main categories: human-powered \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:SBF, author = "Yun Wang and Chrysanthi Kosyfaki and Sihem Amer-Yahia and Reynold Cheng", title = "A Sampling-Based Framework for Hypothesis Testing on Large Attributed Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3192--3200", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681993", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hypothesis testing is a statistical method used to draw conclusions about populations from sample data, typically represented in tables. With the prevalence of graph representations in real-life applications, hypothesis testing on graphs is gaining \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:LPA, author = "Qinbin Li and Junyuan Hong and Chulin Xie and Jeffrey Tan and Rachel Xin and Junyi Hou and Xavier Yin and Zhun Wang and Dan Hendrycks and Zhangyang Wang and Bo Li and Bingsheng He and Dawn Song", title = "{LLM-PBE}: Assessing Data Privacy in Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3201--3214", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681994", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large Language Models (LLMs) have become integral to numerous domains, significantly advancing applications in data management, mining, and analysis. Their profound capabilities in processing and interpreting complex language data, however, bring to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Birler:2024:RJP, author = "Altan Birler and Alfons Kemper and Thomas Neumann", title = "Robust Join Processing with Diamond Hardened Joins", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3215--3228", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681995", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Join ordering and join processing has a huge impact on query execution and can easily affect the query response time by orders of magnitude. In particular, when joins are potentially growing n:m joins, execution can be very expensive. This can be seen by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zuo:2024:DET, author = "Rundong Zuo and Guozhong Li and Rui Cao and Byron Choi and Jianliang Xu and Sourav S. Bhowmick", title = "{DARKER}: Efficient Transformer with Data-Driven Attention Mechanism for Time Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3229--3242", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681996", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transformer-based models have facilitated numerous applications with superior performance. A key challenge in transformers is the quadratic dependency of its training time complexity on the length of the input sequence. A recent popular solution is using \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:EMF, author = "Yanping Wu and Renjie Sun and Xiaoyang Wang and Dong Wen and Ying Zhang and Lu Qin and Xuemin Lin", title = "Efficient Maximal Frequent Group Enumeration in Temporal Bipartite Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3243--3255", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681997", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cohesive subgraph mining is a fundamental problem in bipartite graph analysis. In reality, relationships between two types of entities often occur at some specific timestamps, which can be modeled as a temporal bipartite graph. However, the temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chao:2024:OVQ, author = "Daren Chao and Yueting Chen and Nick Koudas and Xiaohui Yu", title = "Optimizing Video Queries with Declarative Clues", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3256--3268", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681998", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Video Database Management Systems (VDBMS) leverage advancements in computer vision and deep learning for efficient video data analysis and retrieval. This paper introduces the concept of user-specified Clues, allowing users to incorporate domain-specific \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Behme:2024:FFA, author = "Lennart Behme and Sainyam Galhotra and Kaustubh Beedkar and Volker Markl", title = "{Fainder}: a Fast and Accurate Index for Distribution-Aware Dataset Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3269--3282", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3681999", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient data discovery is crucial in the era of data-driven decision-making. However, current practices face significant challenges due to the intricacies of identifying datasets with specific distributional characteristics, such as percentiles, when \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2024:NAN, author = "Ye Yuan and Bo Tang and Tianfei Zhou and Zhiwei Zhang and Jianbin Qin", title = "{nsDB}: Architecting the Next Generation Database by Integrating Neural and Symbolic Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3283--3289", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682000", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we propose nsDB, a novel neuro-symbolic database system that integrates neural and symbolic system architectures natively to address the weaknesses of each, providing a strong database capable of data managing, model learning, and complex \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schmidt:2024:TBO, author = "Tobias Schmidt and Dominik Durner and Viktor Leis and Thomas Neumann", title = "Two Birds With One Stone: Designing a Hybrid Cloud Storage Engine for {HTAP}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3290--3303", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682001", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Businesses are increasingly demanding real-time analytics on up-to-date data. However, current solutions fail to efficiently combine transactional and analytical processing in a single system. Instead, they rely on extract-transform-load pipelines to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:DDO, author = "Qizhen Zhang and Philip A. Bernstein and Badrish Chandramouli and Jiasheng Hu and Yiming Zheng", title = "{DDS}: {DPU}-Optimized Disaggregated Storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3304--3317", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682002", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents DDS, a novel disaggregated storage architecture enabled by emerging networking hardware, namely DPUs (Data Processing Units). DPUs can optimize the latency and CPU consumption of disaggregated storage servers. However, utilizing DPUs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:DNL, author = "Boyan Li and Yuyu Luo and Chengliang Chai and Guoliang Li and Nan Tang", title = "The Dawn of Natural Language to {SQL}: Are We Fully Ready?", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3318--3331", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682003", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Translating users' natural language questions into SQL queries (i.e., nl2sql) significantly lowers the barriers to accessing relational databases. The emergence of Large Language Models has introduced a novel paradigm in nl2sql tasks, enhancing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Si:2024:CES, author = "Michelle Si and Jian Pei", title = "Counterfactual Explanation of {Shapley} Value in Data Coalitions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3332--3345", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682004", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Shapley value is widely used for data valuation in data markets. However, explaining the Shapley value of an owner in a data coalition is an unexplored and challenging task. To tackle this, we formulate the problem of finding the counterfactual \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:SDL, author = "Yi Zhang and Peter Baile Chen and Zachary G. Ives", title = "Searching Data Lakes for Nested and Joined Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3346--3359", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682005", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Exploratory data science is driving new platforms that assist data scientists with everyday tasks, such as integration and wrangling, to assemble training datasets. Such tools take scientists' work-in-progress data as a search object (table or JSON) and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:EBC, author = "Xinrui Wang and Yiran Wang and Xuemin Lin and Jeffrey Xu Yu and Hong Gao and Xiuzhen Cheng and Dongxiao Yu", title = "Efficient Betweenness Centrality Computation over Large Heterogeneous Information Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3360--3372", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682006", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Betweenness centrality (BC), a classic measure which quantifies the importance of a vertex to act as a communication ``bridge'' between other vertices in the network, is widely used in many practical applications. With the advent of large heterogeneous \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:HAS, author = "William Zhang and Wan Shen Lim and Matthew Butrovich and Andrew Pavlo", title = "The Holon Approach for Simultaneously Tuning Multiple Components in a Self-Driving Database Management System with Machine Learning via Synthesized Proto-Actions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3373--3387", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682007", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing machine learning (ML) approaches to automatically optimize database management systems (DBMSs) only target a single configuration space at a time (e.g., knobs, query hints, indexes). 
Simultaneously tuning multiple configuration spaces is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2024:DCA, author = "Zhen Song and Yu Gu and Qing Sun and Tianyi Li and Yanfeng Zhang and Yushuai Li and Christian S. Jensen and Ge Yu", title = "{DynaHB}: a Communication-Avoiding Asynchronous Distributed Framework with Hybrid Batches for Dynamic {GNN} Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3388--3401", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682008", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dynamic Graph Neural Networks (DGNNs) have demonstrated exceptional performance at dynamic-graph analysis tasks. However, the costs exceed those incurred by other learning tasks, to the point where deployment on large-scale dynamic graphs is infeasible. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Feng:2024:ESP, author = "Qingshuai Feng and Junhua Zhang and Wenjie Zhang and Lu Qin and Ying Zhang and Xuemin Lin", title = "Efficient {$k$NN} Search in Public Transportation Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3402--3414", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682009", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Public transportation plays a vital role in mitigating traffic congestion and reducing carbon emissions. 
The Top-$k$ Nearest Neighbor ($k$NN) search in public transportation networks is a fundamental problem in location-based services, which aims to find $k$ \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2024:LOL, author = "Yifan Yang and Shimin Chen", title = "{LITS}: an Optimized Learned Index for Strings", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3415--3427", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682010", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Index is an important component in database systems. Learned indexes have been shown to outperform traditional tree-based index structures for fixed-sized integer or floating point keys. However, the application of the learned solution to variable-length \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fogli:2024:OMC, author = "Alessandro Fogli and Bo Zhao and Peter Pietzuch and Maximilian Bandle and Jana Giceva", title = "{OLAP} on Modern Chiplet-Based Processors", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3428--3441", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682011", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Chiplet-based CPUs, which combine multiple independent dies on a single package, allow hardware to scale to higher CPU core counts at the cost of more memory heterogeneity and performance variability. 
This introduces challenges when existing query \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hao:2024:BTM, author = "Xiangpeng Hao and Badrish Chandramouli", title = "{Bf-Tree}: a Modern Read-Write-Optimized Concurrent Larger-Than-Memory Range Index", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3442--3455", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682012", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A B-Tree is the most widely used range index for larger-than-memory data systems. It organizes data in pages (usually 4 KB) that efficiently align with disk IO operations, fully utilizing each IO operation to narrow down the search space. On the other \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hansert:2024:PDS, author = "Patrick Hansert and Sebastian Michel", title = "Partition, Don't Sort! {Compression} Boosters for Cloud Data Ingestion Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3456--3469", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682013", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data Lakes deployed in the cloud are a go-to solution for enterprise data storage. While the pay-as-you-go cost model allows flexible resource allocation and billing, it mandates an efficient use of resources like CPU hours, network traffic, and used \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Erfanian:2024:CFM, author = "Mahdi Erfanian and H. V. Jagadish and Abolfazl Asudeh", title = "{Chameleon}: Foundation Models for Fairness-Aware Multi-Modal Data Augmentation to Enhance Coverage of Minorities", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3470--3483", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682014", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:17:40 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Potential harms from the under-representation of minorities in data, particularly in multi-modal settings, is a well-recognized concern. While there has been extensive effort in detecting such under-representation, resolution has remained a challenge. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2024:DRM, author = "Xiaoou Ding and Yixing Lu and Hongzhi Wang and Chen Wang and Yida Liu and Jianmin Wang", title = "{DAFDiscover}: Robust Mining Algorithm for Dynamic Approximate Functional Dependencies on Dirty Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3484--3496", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682015", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data dependency mining plays a crucial role in understanding data relationships. To address the increasing complexities of real-world data, Approximate Functional Dependencies (AFDs) have been introduced, building upon traditional FD. However, existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohr-Daurat:2024:HED, author = "Hubert Mohr-Daurat and Georgios Theodorakis and Holger Pirk", title = "Hardware-Efficient Data Imputation through {DBMS} Extensibility", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3497--3510", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682016", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The separation of data and code/queries has served Data Management Systems (DBMSs) well for decades. However, while the resulting soundness and rigidity are the basis for many performance-oriented optimizations, it lacks the flexibility to efficiently \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trummer:2024:GSD, author = "Immanuel Trummer", title = "Generating Succinct Descriptions of Database Schemata for Cost-Efficient Prompting of Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3511--3523", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682017", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Using large language models (LLMs) for tasks like text-to-SQL translation often requires describing the database schema as part of the model input. LLM providers typically charge as a function of the number of tokens read. Hence, reducing the length of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Srivastava:2024:SMA, author = "Tapan Srivastava and Raul Castro Fernandez", title = "Saving Money for Analytical Workloads in the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3524--3537", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682018", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As users migrate their analytical workloads to cloud databases, it is becoming just as important to reduce monetary costs as it is to optimize query runtime. In the cloud, a query is billed based on either its compute time or the amount of data it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yun:2024:RBJ, author = "Joohyung Yun and Byungchul Tak and Wook-Shin Han", title = "{ReCG}: Bottom-up {JSON} Schema Discovery Using a Repetitive Cluster-and-Generalize Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3538--3550", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682019", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The schemalessness, one of the major advantages of JSON representation format, comes with high penalties in querying and operations by denying various critical functions such as query optimizations, indexing, or data verification. There have been \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mersy:2024:OCB, author = "Gabriel Mersy and Zhuo Wang and Stavros Sintos and Sanjay Krishnan", title = "Optimizing Collections of {Bloom} Filters within a Space Budget", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3551--3564", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682020", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With a single Bloom filter, one can approximately answer set membership queries within a space budget. Practical systems often use collections of Bloom filters to facilitate applications such as data skipping, sideways information passing, and network \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lyu:2024:SOA, author = "Chenghao Lyu and Qi Fan and Philippe Guyard and Yanlei Diao", title = "A Spark Optimizer for Adaptive, Fine-Grained Parameter Tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3565--3579", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682021", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As Spark becomes a common big data analytics platform, its growing complexity makes automatic tuning of numerous parameters critical for performance. Our work on Spark parameter tuning is particularly motivated by two recent trends: Spark's Adaptive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:TSC, author = "Zuozhi Wang and Yicong Huang and Shengquan Ni and Avinash Kumar and Sadeem Alsudais and Xiaozhen Liu and Xinyuan Lin and Yunyan Ding and Chen Li", title = "{Texera}: a System for Collaborative and Interactive Data Analytics Using Workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3580--3588", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682022", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Domain experts play an important role in data science, as their knowledge can unlock valuable insights from data. As they often lack technical skills required to analyze data, they need collaborations with technical experts. In these joint efforts, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ke:2024:EVS, author = "Jin Ke and Zenon Zacouris and Maribel Acosta", title = "Efficient Validation of {SHACL} Shapes with Reasoning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3589--3601", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682023", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As the usage of knowledge graphs (KGs) becomes more pervasive in practical applications, there is a burgeoning need for high-quality data. The SHApes Constraint Language (SHACL) allows for expressing certain types of quality constraints that define sub-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:QPQ, author = "Shuxian Wang and Sicheng Pan and Alvin Cheung", title = "{QED}: a Powerful Query Equivalence Decider for {SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3602--3614", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682024", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Checking query equivalence is of great significance in database systems. Prior work in automated query equivalence checking sets the first steps in formally modeling and reasoning about query optimization rules, but only supports a limited number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kesarwani:2024:IAQ, author = "Manish Kesarwani and Jayant R. Haritsa", title = "Index Advisors on Quantum Platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3615--3628", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682025", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Index Advisor tools settle for sub-optimal index configurations based on greedy heuristics, owing to the computational hardness of index selection. We investigate here how this limitation can be addressed by leveraging the computing power offered by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:BCU, author = "Geoffrey X. 
Yu and Ziniu Wu and Ferdi Kossmann and Tianyu Li and Markos Markakis and Amadou Ngom and Samuel Madden and Tim Kraska", title = "Blueprinting the Cloud: Unifying and Automatically Optimizing Cloud Data Infrastructures with {BRAD}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3629--3643", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682026", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern organizations manage their data with a wide variety of specialized cloud database engines (e.g., Aurora, BigQuery, etc.). However, designing and managing such infrastructures is hard. Developers must consider many possible designs with non-obvious \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dayan:2024:AFI, author = "Niv Dayan and Ioana-Oriana Bercea and Rasmus Pagh", title = "{Aleph Filter}: To Infinity in Constant Time", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3644--3656", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682027", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Filter data structures are widely used in various areas of computer science to answer approximate set-membership queries. In many applications, the data grows dynamically, requiring their filters to expand along with the data. However, existing methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:RRT, author = "Yuhang Chen and Jiaxin Jiang and Shixuan Sun and Bingsheng He and Min Chen", title = "{RUSH}: Real-Time Burst Subgraph Detection in Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3657--3665", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682028", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph analytics have been effective in the data science pipeline of fraud detections. In the ever-evolving landscape of e-commerce platforms like Grab or transaction networks such as cryptos, we have witnessed the phenomenon of `burst subgraphs,' \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2024:BSD, author = "Zhicheng Liang and Yu Yang and Xiangyu Ke and Xiaokui Xiao and Yunjun Gao", title = "A Benchmark Study of Deep-{RL} Methods for Maximum Coverage Problems over Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3666--3679", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682029", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent years have witnessed a growing trend toward employing deep reinforcement learning (Deep-RL) to derive heuristics for combinatorial optimization (CO) problems on graphs. Maximum Coverage Problem (MCP) and its probabilistic variant on social \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lim:2024:HGA, author = "Wan Shen Lim and Lin Ma and William Zhang and Matthew Butrovich and Samuel Arch and Andrew Pavlo", title = "Hit the Gym: Accelerating Query Execution to Efficiently Bootstrap Behavior Models for Self-Driving Database Management Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3680--3693", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682030", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Autonomous database management systems (DBMSs) aim to optimize themselves automatically without human guidance. They rely on machine learning (ML) models that predict their run-time behavior to evaluate whether a candidate configuration is beneficial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{vanRenen:2024:WTE, author = "Alexander van Renen and Dominik Horn and Pascal Pfeil and Kapil Vaidya and Wenjian Dong and Murali Narayanaswamy and Zhengchun Liu and Gaurav Saxena and Andreas Kipf and Tim Kraska", title = "Why {TPC} is Not Enough: an Analysis of the {Amazon Redshift Fleet}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3694--3706", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682031", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database research and development is heavily influenced by benchmarks, such as the industry-standard TPC-H and TPC-DS for analytical systems. 
However, these twenty-year-old benchmarks neither capture how databases are deployed nor what workloads modern \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2024:ECC, author = "Lijun Chang and Rashmika Gamage and Jeffrey Xu Yu", title = "Efficient $k$-Clique Count Estimation with Accuracy Guarantee", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "11", pages = "3707--3719", month = jul, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3681954.3682032", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:22:51 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Counting and enumerating all occurrences of $k$-cliques, i.e., complete subgraphs with $k$ vertices, in a large graph $G$ is a fundamental problem with many applications. However, exact solutions are often infeasible due to the exponential growth in the number \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Eldeeb:2024:CAO, author = "Tamer Eldeeb and Sebastian Burckhardt and Reuben Bond and Asaf Cidon and Junfeng Yang and Philip A.
Bernstein", title = "Cloud Actor-Oriented Database Transactions in Orleans", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3720--3730", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685801", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Microsoft Orleans is a popular open source distributed programming framework and platform which invented the virtual actor model, and has since evolved into an actor-oriented database system with the addition of database abstractions such as ACID \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schulze:2024:CLF, author = "Robert Schulze and Tom Schreiber and Ilya Yatsishin and Ryadh Dahimene and Alexey Milovidov", title = "{ClickHouse} --- Lightning Fast Analytics for Everyone", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3731--3744", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685802", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the past several decades, the amount of data being stored and analyzed has increased exponentially. Businesses across industries and sectors have begun relying on this data to improve products, evaluate performance, and make business-critical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2024:PRW, author = "Fusheng Han and Hao Liu and Bin Chen and Debin Jia and Jianfeng Zhou and Xuwang Teng and Chuanhui Yang and Huafeng Xi and Wei Tian and Shuning Tao and Sen Wang and Quanqing Xu and Zhenkun Yang", title = "{PALF}: Replicated Write-Ahead Logging for Distributed Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3745--3758", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685803", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed databases have been widely researched and developed in recent years due to their scalability, availability, and consistency guarantees. The write-ahead logging (WAL) system is one of the most vital components in a database. It is still a non-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:TRE, author = "Yixin Wu and Xiuqi Huang and Zhongjia Wei and Hang Cheng and Chaohui Xin and Zuzhi Chen and Binbin Chen and Yufei Wu and Hao Wang and Tieying Zhang and Rui Shi and Xiaofeng Gao and Yuming Liang and Pengwei Zhao and Guihai Chen", title = "Towards Resource Efficiency: Practical Insights into Large-Scale Spark Workloads at {ByteDance}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3759--3771", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685804", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "At ByteDance, where we execute over a million Spark jobs and handle 500PB of shuffled data daily, ensuring resource efficiency is paramount for cost savings. However, achieving optimization of resource efficiency in large-scale production environments \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:SVI, author = "Cheng Chen and Chenzhe Jin and Yunan Zhang and Sasha Podolsky and Chun Wu and Szu-Po Wang and Eric Hanson and Zhou Sun and Robert Walzer and Jianguo Wang", title = "{SingleStore-V}: an Integrated Vector Database System in {SingleStore}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3772--3785", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685805", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector databases have recently gained significant attention due to the emergence of large language models that produce vector embeddings for text. 
Existing vector databases can be broadly categorized into two types: specialized and generalized. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:GCN, author = "Guoliang Li and Wengang Tian and Jinyu Zhang and Ronen Grosman and Zongchao Liu and Sihao Li", title = "{GaussDB}: a Cloud-Native Multi-Primary Database with Compute-Memory-Storage Disaggregation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3786--3798", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685806", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud-native databases have been widely deployed due to high elasticity, high availability and low cost. However, most existing cloud-native databases do not support multiple writers and thus have limitations on write throughput and scalability. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:LBP, author = "Hao Wang and Jiaxin Ou and Ming Zhao and Sheng Qiu and Yizheng Jiao and Yi Wang and Qizhong Mao and Zhengyu Yang and Yang Liu and Jianshun Zhang and Jianyang Hu and Jingwei Zhang and Jinrui Liu and Jiaqiang Chen and Yong Shen and Lixun Cao and Heng Zhang and Hongde Li and Ming Li and Yue Ma and Lei Zhang and Jian Liu and Guanghui Zhang and Fei Liu and Jianjun Chen", title = "{LavaStore}: {ByteDance}'s Purpose-Built, High-Performance, Cost-Effective Local Storage Engine for Cloud Services", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3799--3812", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685807", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Persistent key-value (KV) stores are widely used by cloud services at ByteDance as local storage engines, and RocksDB used to be the de facto implementation since it can be tailored to a variety of workloads and requirements. In this paper, we provide \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Paduroiu:2024:MSP, author = "Andrei Paduroiu and Sungheun Wi and Yan Yan and Roni Burd and Ruhollah Farchtchi and Giovanni Matteo Fumarola", title = "{Membrane} --- Safe and Performant Data Access Controls in {Apache Spark} in the Presence of Imperative Code", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3813--3826", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685808", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data Governance is an increasingly critical feature of modern cloud database systems, enabling administrators to set granular access policies on their data. AWS customers want to define row or column filtering on their blob storage data and access it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahn:2024:ECM, author = "Minseon Ahn and Thomas Willhalm and Norman May and Donghun Lee and Suprasad Mutalik Desai and Daniel Booss and Jungmin Kim and Navneet Singh and Daniel Ritter and Oliver Rebholz", title = "An Examination of {CXL} Memory Use Cases for In-Memory Database Management Systems Using {SAP HANA}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3827--3840", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685809", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "CXL-based disaggregated memory systems offer options to expand the memory beyond the limits of a single server via cache-coherent memory expansion cards or memory pools. 
Especially, In-Memory Database Management Systems (IMDBMSs) can benefit from \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yi:2024:KSK, author = "Peng Yi and Lei Liang and Da Zhang and Yong Chen and Jinye Zhu and Xiangyu Liu and Kun Tang and Jialin Chen and Hao Lin and Leijie Qiu and Jun Zhou", title = "{KGFabric}: a Scalable Knowledge Graph Warehouse for Enterprise Data Interconnection", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3841--3854", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685810", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Based on the diversified application scenarios at Ant Group, we built the Ant Knowledge Graph Platform (AKGP). It has constructed numerous domain-specific knowledge graphs related to merchants, companies, accounts, products, and more. AKGP manages \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bianchi:2024:DTU, author = "Alexander Bianchi and Andrew Chai and Vincent Corvinelli and Parke Godfrey and Jarek Szlichta and Calisto Zuzarte", title = "{Db2une}: Tuning Under Pressure via Deep Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3855--3868", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685811", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern database systems including IBM Db2 have numerous parameters, ``knobs,'' that require precise configuration to achieve optimal workload performance. 
Even for experts, manually ``tuning'' these knobs is a challenging process. We present Db2une, an \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2024:TTD, author = "Yuxing Chen and Anqun Pan and Hailin Lei and Anda Ye and Shuo Han and Yan Tang and Wei Lu and Yunpeng Chai and Feng Zhang and Xiaoyong Du", title = "{TDSQL}: Tencent Distributed Database System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3869--3882", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685812", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed databases have become indispensable in contemporary computing and data processing, owing to their pivotal role in ensuring high availability and scalability. They effectively cater to the requirements of data management and high-concurrency \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Januschowski:2024:FFS, author = "Tim Januschowski and Yuyang Wang and Jan Gasthaus and Syama Rangapuram and Caner T{\"u}rkmen and Jasper Zschiegner and Lorenzo Stella and Michael Bohlke-Schneider and Danielle Maddix and Konstantinos Benidis and Alexander Alexandrov and Christos Faloutsos and Sebastian Schelter", title = "A Flexible Forecasting Stack", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3883--3892", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685813", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Forecasting extrapolates the values of a time series into the future, and is crucial to optimize core operations for many businesses and organizations. Building machine learning (ML)-based forecasting applications presents a challenge though, due to non-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tong:2024:GHP, author = "Bing Tong and Yan Zhou and Chen Zhang and Jianheng Tang and Jing Tang and Leihong Yang and Qiye Li and Manwu Lin and Zhongxin Bao and Jia Li and Lei Chen", title = "{Galaxybase}: a High Performance Native Distributed Graph Database for {HTAP}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3893--3905", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685814", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce Galaxybase, a native distributed graph database that addresses the increasing demands for processing large volumes of graph data in diverse industries like finance, manufacturing, and government. Designed to handle the requirements of both \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2024:SEP, author = "Xinying Yang and Cong Yue and Wenhui Zhang and Yang Liu and Beng Chin Ooi and Jianjun Chen", title = "{SecuDB}: an In-Enclave Privacy-Preserving and Tamper-Resistant Relational Database", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3906--3919", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685815", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the escalation in the demand for privacy-preserving and tamper-resistant data management and processing on the public cloud, an increasing number of mainstream databases start to provide always-encrypted and blockchain-like features, including \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:ATA, author = "Jun-Peng Zhu and Peng Cai and Kai Xu and Li Li and Yishen Sun and Shuai Zhou and Haihuang Su and Liu Tang and Qi Liu", title = "{AutoTQA}: Towards Autonomous Tabular Question Answering through Multi-Agent Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3920--3933", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685816", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the growing significance of data analysis, several studies aim to provide precise answers to users' natural language questions from tables, a task referred to as tabular question answering (TQA). The state-of-the-art TQA approaches are limited to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:RTM, author = "Xinchun Zhang and Aqsa Kashaf and Yihan Zou and Wei Zhang and Weibo Liao and Haoxiang Song and Jintao Ye and Yakun Li and Rui Shi and Yong Tian and Wei Feng and Binbin Chen and Zuzhi Chen and Tieying Zhang and Yongping Tang", title = "{ResLake}: Towards Minimum Job Latency and Balanced Resource Utilization in Geo-Distributed Job Scheduling", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3934--3946", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685817", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "At internet scale companies like ByteDance, data is generated and consumed at enormously high speed by many different applications. Achieving low latency on such big data jobs is an important problem. However, the naive approach of aggregating all the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xue:2024:ARQ, author = "Maryann Xue and Yingyi Bu and Abhishek Somani and Wenchen Fan and Ziqi Liu and Steven Chen and Herman van Hovell and Bart Samwel and Mostafa Mokhtar and RK Korlapati and Andy Lam and Yunxiao Ma and Vuk Ercegovac and Jiexing Li and Alexander Behm and Yuanjian Li and Xiao Li and Sriram Krishnamurthy and Amit Shukla and Michalis Petropoulos and Sameer Paranjpye and Reynold Xin and Matei Zaharia", title = "Adaptive and Robust Query Execution for Lakehouses at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3947--3959", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685818", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many organizations have embraced the ``Lakehouse'' data management paradigm, which involves constructing structured data warehouses on top of open, unstructured data lakes. This approach stands in stark contrast to traditional, closed, relational databases \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Davisson:2024:TMD, author = "Ed Davisson and Tilo Dickopp and David Gay and Eric Karasuda and Ram Kesavan and Vadim Yushprakh", title = "Transparent Migration from Datastore to Firestore", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3960--3972", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685819", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Datastore was one of Google's first cloud databases, launched initially as part of App Engine, and built over Google's internal Megastore database system. Firestore was launched in 2019, both a re-implementation of Datastore over Google's Spanner \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hang:2024:CPE, author = "Jinquan Hang and Zhiqing Hong and Xinyue Feng and Guang Wang and Dongjiang Cao and Jiayang Qiao and Haotian Wang and Desheng Zhang", title = "{Complex-Path}: Effective and Efficient Node Ranking with Paths in Billion-Scale Heterogeneous Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3973--3986", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685820", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Node ranking in heterogeneous graphs, which quantifies the relative importance of nodes, can often be improved by incorporating information from relevant paths. Graph database and heterogeneous graph neural network (HGNN) are two main approaches to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fang:2024:SSS, author = "Wenjing Fang and Shunde Cao and Guojin Hua and Junming Ma and Yongqiang Yu and Qunshan Huang and Jun Feng and Jin Tan and Xiaopeng Zan and Pu Duan and Yang Yang and Li Wang and Ke Zhang and Lei Wang", title = "{SecretFlow-SCQL}: a Secure Collaborative Query Platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "3987--4000", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685821", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the business scenarios at Ant Group, there is a rising demand for collaborative data analysis among multiple institutions, which can promote health insurance, financial services, risk control, and others. However, the increasing concern about privacy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2024:TMD, author = "Hua Fan and Dachao Fu and Xu Wang and Jiachi Zhang and Chaoji Zuo and Zhengyi Wu and Miao Zhang and Kang Yuan and Xizi Ni and Guocheng Huo and Wenchao Zhou and Feifei Li and Jingren Zhou", title = "Towards Millions of Database Transmission Services in the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4001--4013", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685822", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Alibaba relies on its robust database infrastructure to facilitate realtime data access and ensure business continuity despite regional disruptions. 
To address these operational imperatives, Alibaba developed the Data Transmission Service (DTS), which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiong:2024:LSM, author = "Tao Xiong and Yong Wang", title = "Large-Scale Metric Computation in Online Controlled Experiment Platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4014--4024", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685823", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Online controlled experiment (also called A/B test or experiment) is the most important tool for decision-making at a wide range of data-driven companies like Microsoft, Google, Meta, etc. Metric computation is the core procedure for reaching a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lei:2024:XSC, author = "Hongyu Lei and Chunhua Li and Ke Zhou and Jianping Zhu and Kezhou Yan and Fen Xiao and Ming Xie and Jiang Wang and Shiyu Di", title = "{X-Stor}: a Cloud-Native {NoSQL} Database Service with Multi-Model Support", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4025--4037", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685824", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years at Tencent, we have observed that the use of multiple NoSQL databases for storing business data with diverse models has led to increased programming and deployment costs, as well as inefficient maintenance and underutilized resources. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Barnhart:2024:RMA, author = "Bradley Barnhart and Marc Brooker and Daniil Chinenkov and Tony Hooper and Jihoun Im and Prakash Chandra Jha and Tim Kraska and Ashok Kurakula and Alexey Kuznetsov and Grant McAlister and Arjun Muthukrishnan and Aravinthan Narayanan and Douglas Terry and Bhuvan Urgaonkar and Jiaming Yan", title = "Resource Management in {Aurora} Serverless", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4038--4050", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685825", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Amazon Aurora Serverless is an on-demand, autoscaling configuration for Amazon Aurora with full MySQL and PostgreSQL compatibility. 
It automatically offers capacity scale-up/down (i.e., vertical scaling) based on a customer database application's needs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shute:2024:SPW, author = "Jeff Shute and Shannon Bales and Matthew Brown and Jean-Daniel Browne and Brandon Dolphin and Romit Kudtarkar and Andrey Litvinov and Jingchi Ma and John Morcos and Michael Shen and David Wilhite and Xi Wu and Lulan Yu", title = "{SQL} Has Problems. We Can Fix Them: Pipe Syntax In {SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4051--4063", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685826", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SQL has been extremely successful as the de facto standard language for working with data. Virtually all mainstream database-like systems use SQL as their primary query language. But SQL is an old language with significant design problems, making it \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2024:ATI, author = "Xin Zhao and Jialin Qiao and Xiangdong Huang and Chen Wang and Shaoxu Song and Jianmin Wang", title = "{Apache TsFile}: an {IoT}-Native Time Series File Format", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4064--4076", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685827", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The proliferation of the Internet of Things (IoT) has led to an exponential increase in time series data, distributed and applied in various contexts, demanding a dedicated storage solution. Based on our observations and analysis of IoT production \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shankhdhar:2024:PHB, author = "Pranjal Shankhdhar and Feilong Liu and Jay Narale and James Sun and Rebecca Schlussel and Lyublena Antova", title = "{Presto}'s History-Based Query Optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4077--4089", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685828", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "An important feature of modern query optimizers is the ability to produce a query plan that is optimal for the underlying data set. This requires the ability to estimate cardinalities and computational costs of intermediate query plan nodes, which is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zou:2024:OCF, author = "Ding Zou and Wei Lu and Zhibo Zhu and Xingyu Lu and Jun Zhou and Xiaojin Wang and Kangyu Liu and Kefan Wang and Renen Sun and Haiqing Wang", title = "{OptScaler}: a Collaborative Framework for Robust Autoscaling in the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4090--4103", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685829", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Autoscaling is a critical mechanism in cloud computing, enabling the autonomous adjustment of computing resources in response to dynamic workloads. This is particularly valuable for co-located, long-running applications with diverse workload patterns. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:DAA, author = "Joshua Wu and Dixin Tang and Nithin Chalapathi and Tristan Chambers and Julie Ciccolini and Cheryl Phillips and Lisa Pickoff-White and Aditya Parameswaran", title = "Dealing with Acronyms, Abbreviations, and Typos in Real-World Entity Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4104--4116", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "String matching is at the core of data cleaning, record matching, and information retrieval. 
String matching relies on a similarity measure that evaluates the similarity of two strings, regarding the two as a match if their similarity is larger than a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ouyang:2024:LUU, author = "Qianyu Ouyang and Chunhui Shen and Wenlong Yang and Peng Yu and Qiang Xiao and Jianhui Lei and Yadong Chen and Qilu Zhong and Xiang Wang and Yong Lin and Qingyi Meng and Zhicheng Ji and Wei Meng and Cen Zheng and Sheng Wang and Dan Pei and Wei Zhang and Feifei Li and Jingren Zhou", title = "{Lindorm-UWC}: an Ultra-Wide-Column Database for {Internet} of Vehicles", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4117--4129", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the Internet of Vehicle (IoV) systems, intelligent vehicles generate huge amounts of data that supports diverse services and applications. In practice, database systems are deployed in the cloud to manage data uploaded from the vehicle side and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:DRR, author = "Qinlong Wang and Tingfeng Lan and Yinghao Tang and Bo Sang and Ziling Huang and Yiheng Du and Haitao Zhang and Jian Sha and Hui Lu and Yuanchun Zhou and Ke Zhang and Mingjie Tang", title = "{DLRover-RM}: Resource Optimization for Deep Recommendation Models Training in the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4130--4144", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685832", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep learning recommendation models (DLRM) rely on large embedding tables to manage categorical sparse features. Expanding such embedding tables can significantly enhance model performance, but at the cost of increased GPU/CPU/memory usage. Meanwhile, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:DPS, author = "Bing Zhang and Vadym Doroshenko and Peter Kairouz and Thomas Steinke and Abhradeep Thakurta and Ziyin Ma and Eidan Cohen and Himani Apte and Jodi Spacek", title = "Differentially Private Stream Processing at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4145--4158", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We design, to the best of our knowledge, the first differentially private (DP) stream aggregation processing system at scale. 
Our system --- Differential Privacy SQL Pipelines (DP-SQLP) --- is built using a streaming framework similar to Spark streaming, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Okolnychyi:2024:PSR, author = "Anton Okolnychyi and Chao Sun and Kazuyuki Tanimura and Russell Spitzer and Ryan Blue and Szehon Ho and Yufei Gu and Vishwanath Lakkundi and DB Tsai", title = "Petabyte-Scale Row-Level Operations in Data Lakehouses", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4159--4172", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data lakehouses combine the almost infinite scale and diverse tooling of a data lake with the reliability and functionality of a data warehouse. This paper presents extensions that enhance data lakehouses using Apache Iceberg and Apache Spark with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shankar:2024:SSD, author = "Shreya Shankar and Haotian Li and Parth Asawa and Madelon Hulsebos and Yiming Lin and J. D. Zamfirescu-Pereira and Harrison Chase and Will Fu-Hinthorn and Aditya G. 
Parameswaran and Eugene Wu", title = "{spade}: Synthesizing Data Quality Assertions for Large Language Model Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4173--4186", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large language models (LLMs) are being increasingly deployed as part of pipelines that repeatedly process or generate data of some sort. However, a common barrier to deployment are the frequent and often unpredictable errors that plague LLMs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sakka:2024:SYE, author = "Laith Sakka and Pedro Pedreira and Orri Erling and Masha Basmanova and Kevin Wilfong and Wei He and Xiaoxuan Meng and Krishna Pai and Bikramjeet Vig", title = "Simple (yet Efficient) Function Authoring for Vectorized Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4187--4199", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vectorized execution engines process large datasets by decomposing computations into concise (tight) loops, which can be more efficiently executed by modern hardware. Providing loops that are optimal for execution usually adds burden to the software \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmed:2024:GSD, author = "Rafi Ahmed and Krishna Kantikiran Pasupuleti and Sriram Tirupattur and Lei Sheng and Hong Su and Mohamed Ziauddin", title = "Grouping, Subsumption, and Disjunctive Join Optimizations in Oracle", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4200--4212", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685837", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimization must evolve with new workloads. As analytic and data warehouse workloads become more ubiquitous, optimization techniques that reduce the amount of data processed during query execution, enable shared computation and avoid expensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:LDM, author = "Guoliang Li and Xuanhe Zhou and Xinyang Zhao", title = "{LLM} for Data Management", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4213--4216", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685838", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning techniques have been verified to be effective in optimizing data management systems and are widely researched in recent years. However, traditional small-sized ML models often struggle to generalize to new scenarios, and have limited \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:NDD, author = "Quanqing Xu and Chuanhui Yang and Aoying Zhou", title = "Native Distributed Databases: Problems, Challenges and Opportunities", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4217--4220", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685839", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Native distributed databases, crucial for scalable applications, offer transactional and analytical prowess but face data intricacies and network challenges. Under the CAP theorem's constraints, latency and replication issues necessitate creative \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fischer:2024:RTR, author = "Tim Fischer and Denis Hirn and G{\"o}khan Kul", title = "A Reproducible Tutorial on Reproducibility in Database Systems Research", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4221--4224", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Reproducibility is a key aspect of the scientific method, and it is essential for building trust in the results of research. This tutorial aims to provide concrete guidance on how to leverage containerized reproducibility using Docker for database \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Roy:2024:FPQ, author = "Senjuti Basu Roy and Baruch Schieber and Nimrod Talmon", title = "Fairness in Preference Queries: Social Choice Theories Meet Data Management", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4225--4228", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a large number (notationally $m$) of users' (members or voters) preferences as inputs over a large number of items or candidates (notationally $n$), preference queries leverage different preference aggregation methods to aggregate individual \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:TSA, author = "Qinghua Liu and Paul Boniol and Themis Palpanas and John Paparrizos", title = "Time-Series Anomaly Detection: Overview and New Trends", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4229--4232", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Anomaly detection is a fundamental data analytics task across scientific fields and industries. In recent years, an increasing interest has been shown in the application of anomaly detection techniques to time series. In this tutorial, we take a holistic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nawab:2024:CDM, author = "Faisal Nawab and Mohammad Sadoghi", title = "Consensus in Data Management: With Use Cases in Edge-Cloud and Blockchain Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4233--4236", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Consensus is a fundamental problem in distributed systems, involving the challenge of achieving agreement among distributed nodes. It plays a critical role in various distributed data management problems. This tutorial aims to provide a comprehensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shen:2024:ETG, author = "Yanyan Shen and Lei Chen and Jingzhi Fang and Xin Zhang and Shihong Gao and Hongbo Yin", title = "Efficient Training of Graph Neural Networks on Large Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4237--4240", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Neural Networks (GNNs) have gained significant popularity for learning representations of graph-structured data. Mainstream GNNs employ the message passing scheme that iteratively propagates information between connected nodes through edges. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Carvalho:2024:WPH, author = "Marcos N. L. Carvalho and Alkis Simitsis and Anna Queralt and Oscar Romero", title = "Workload Placement on Heterogeneous {CPU-GPU} Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4241--4244", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The popularity of heterogeneous CPU-GPU processing has increased considerably in recent years. To efficiently utilize heterogeneous resources, data processing systems depend on an appropriate workload placement strategy to assign the right amount of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:SQO, author = "Xin Zhang and Ahmed Eldawy", title = "Spatial Query Optimization With Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4245--4248", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimization is a key component in database management systems (DBMS) and distributed data processing platforms. Recent research in the database community incorporated techniques from artificial intelligence to enhance query optimization. Various \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pedreira:2024:CDM, author = "Pedro Pedreira and Deepak Majeti and Orri Erling", title = "Composable Data Management: an Execution Overview", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4249--4252", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The trend of decomposing monolithic data management systems into a stack of reusable components has quickly gained momentum across the industry. Although a series of open-source projects have emerged targeting different layers of the stack, execution \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2024:SRT, author = "Jiaxin Jiang and Zhen Zhang and Bingqiao Luo and Bingsheng He and Min Chen and WeiYang Wang and Jia Chen", title = "{Spade}: a Real-Time Fraud Detection Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4253--4256", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we introduce Spade, a sophisticated real-time fraud detection framework adept at navigating the complex transaction graph. Unlike conventional methods that are limited by performance and lack incremental update capabilities, Spade \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shi:2024:DDS, author = "Dingyuan Shi and Bingchen Song and Yuanyuan Zhang and Haolong Yang and Ke Xu", title = "A Data-Driven Spatiotemporal Simulator for Reinforcement Learning Methods", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4257--4260", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spatiotemporal applications such as taxi order dispatching and warehouse task scheduling depend critically on the algorithms for operational efficiency. However, the inherent dynamic nature of these applications presents challenges in algorithm design. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qin:2024:BIP, author = "Haoyun Qin and Chenyuan Wu and Mohammad Javad Amiri and Ryan Marcus and Boon Thau Loo", title = "{BFTGym}: an Interactive Playground for {BFT} Protocols", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4261--4264", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Byzantine Fault Tolerant (BFT) protocols serve as a fundamental yet intricate component of distributed data management systems in untrustworthy environments. BFT protocols exhibit different design principles and performance characteristics under varying \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bonifati:2024:DDT, author = "Angela Bonifati and Yann Ramusat and Filip Murlak and Amela Fejza and Rachid Echahed", title = "{DTGraph}: Declarative Transformations of Property Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4265--4268", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Current graph query languages, including the standards SQL/PGQ and GQL, define their semantics in terms of sets of tuples. This is largely inadequate for data interoperability tasks such as data migration or data integration which require queries to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kroth:2024:MAB, author = "Brian Kroth and Sergiy Matusevych and Rana Alotaibi and Yiwen Zhu and Anja Gruenheid and Yuanyuan Tian", title = "{MLOS} in Action: Bridging the Gap Between Experimentation and Auto-Tuning in the Cloud", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4269--4272", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents MLOS (ML Optimized Systems), a flexible framework that bridges the gap between benchmarking, experimentation, and optimization of software systems. It allows users to create one-click benchmarking and experimentation scenarios for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schelter:2024:SRC, author = "Sebastian Schelter and Stefan Grafberger and Maarten de Rijke", title = "{Snapcase} --- Regain Control over Your Predictions with Low-Latency Machine Unlearning", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4273--4276", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The ``right-to-be-forgotten'' requires the removal of personal data from trained machine learning (ML) models with machine unlearning. Conducting such unlearning with low latency is crucial for responsible data management. Low-latency unlearning is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2024:DEN, author = "Yiqun Sun and Qiang Huang and Yanhao Wang and Anthony K. H. Tung", title = "{DiversiNews}: Enriching News Consumption with Relevant Yet Diverse News Articles Retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4277--4280", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the digital age, where echo chambers on social media and news platforms increasingly shape public opinion, there is a growing need for tools that present news consumers with a broad spectrum of perspectives. To this end, we introduce DiversiNews, a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Light:2024:SED, author = "Dean Light and Ahmad Aiashy and Mahmoud Diab and Daniel Nachmias and Stijn Vansummeren and Benny Kimelfeld", title = "{SpannerLib}: Embedding Declarative Information Extraction in an Imperative Workflow", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4281--4284", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Document spanners have been proposed as a formal framework for declarative Information Extraction (IE) from text, following IE products from the industry and academia. Over the past decade, the framework has been studied thoroughly in terms of expressive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Copul:2024:DTT, author = "Roni Copul and Nave Frost and Tova Milo and Kathy Razmadze", title = "Demonstrating {TabEE}: Tabular Embedding Explanations", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4285--4288", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present TabEE, Tabular Embedding Explanations, a framework designed to generate explanations for interpreting tabular embedding models. Our framework aims to furnish both local and global explanations for the original data, facilitating the detection \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ji:2024:NDR, author = "Daomin Ji and Hui Luo and Zhifeng Bao and Shane Culpepper", title = "Navigating Data Repositories: Utilizing Line Charts to Discover Relevant Datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4289--4292", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Line charts are fundamental to data analysis and exploration, offering concise visual representations of trends. However, gaining access to the underlying data used to construct these charts is often challenging. In this paper, we describe DDLC (short \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2024:GAA, author = "Wenfei Fan and Daji Li and Peiyu Liang and Shuhao Liu and Yaoshu Wang and Yiming Wang and Min Xie and Runjie Zhang", title = "Graph Association Analyses for Early Drug Discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4293--4296", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate MedHunter, a system for assisting the early stage of drug development. MedHunter builds a biomedical knowledge graph DDKG by integrating data from eleven biochemical libraries and data banks, and aligning entities from different data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2024:DME, author = "Lindsey Linxi Wei and Chung Yik Edward Yeung and Hongjian Yu and Jingchuan Zhou and Dong He and Magdalena Balazinska", title = "Demonstration of {MaskSearch}: Efficiently Querying Image Masks for Machine Learning Workflows", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4297--4300", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate MaskSearch, a system designed to accelerate queries over databases of image masks generated by machine learning models. MaskSearch formalizes and accelerates a new category of queries for retrieving images and their corresponding masks \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Halfpap:2024:LDM, author = "Stefan Halfpap and Jan Kossmann and Rainer Schlosser and Volker Markl", title = "Looking Deeply into the Magic Mirror: an Interactive Analysis of Database Index Selection Approaches", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4301--4304", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Indexes are important data structures for database tuning. However, finding the best indexes for a given workload is challenging. In this demonstration, we present our extensible open-source index selection evaluation platform and the corresponding \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cho:2024:UAP, author = "Whanhee Cho and Anna Fariha", title = "{Utopia}: Automatic Pivot Table Assistant", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4305--4308", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data summarization is required to comprehend large datasets, and aggregations are effective ways to summarize data. A pivot table is a mechanism to aggregate numerical attributes grouped by categorical attributes and spreadsheet pivot tables are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ang:2024:TIA, author = "Yihao Ang and Yifan Bao and Qiang Huang and Anthony K. H. Tung and Zhiyong Huang", title = "{TSGAssist}: an Interactive Assistant Harnessing {LLMs} and {RAG} for Time Series Generation Recommendations and Benchmarking", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4309--4312", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time Series Generation (TSG) is essential in many industries for generating synthetic data that mirrors real-world characteristics. TSGBench has advanced the field by offering comprehensive evaluations and unique insights for selecting suitable TSG \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2024:DAD, author = "Wonseok Lee and Jaehyun Ha and Wook-Shin Han and Changgyoo Park and Myunggon Park and Juhyeng Han", title = "{DoppelGanger++} in Action: a Database Replay System with Fast Dependency Graph Generation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4313--4316", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685863", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A Database Replay System (DRS) captures workloads from a production system and subsequently replays them in a testing environment to verify correctness and performance. Prior to the replay process, DRS initially generates a dependency graph from the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lai:2024:LBS, author = "Eugenie Lai and Yuze Lou and Brit Youngmann and Michael Cafarella", title = "{LucidScript}: Bottom-Up Standardization for Data Preparation", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4317--4320", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685864", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data preparation is an essential step in every data-related effort, from scientific projects in academia to data-driven decision-making in industry. Typically, data preparation is not an interesting piece of a project --- it transforms raw data into a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ghazal:2024:OSG, author = "Ahmad Ghazal and Zhiyuan Liang and Sunny Bains and Hanumath Maduri", title = "{OSSInsight}: Scalable {GitHub} Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4321--4324", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685865", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "GitHub is a platform hosting code, enabling collaboration, and supporting version control for a global community of over 100 million developers. The need for free tools is crucial for researching open-source software. Based on our research, we found out \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gu:2024:IBB, author = "Long Gu and Si Liu and Tiancheng Xing and Hengfeng Wei and Yuxing Chen and David Basin", title = "{IsoVista}: Black-{Box} Checking Database Isolation Guarantees", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4325--4328", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685866", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transactional isolation is critical to the functional correctness of database management systems (DBMSs). Much effort has recently been devoted to finding isolation bugs and validating isolation fulfilment in production DBMSs. However, there are still \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khayati:2024:IIE, author = "Mourad Khayati and Quentin Nater and Jacques Pasquier", title = "{ImputeVIS}: an Interactive Evaluator to Benchmark Imputation Techniques for Time Series Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4329--4332", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685867", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the emergence of The Internet of Things (IoT), smart sensors have become abundant in our daily lives. Failures are very common in those devices, leaving the recorded time series with missing blocks of consecutive values. A cottage industry of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:IMM, author = "Mengzhao Wang and Haotian Wu and Xiangyu Ke and Yunjun Gao and Xiaoliang Xu and Lu Chen", title = "An Interactive Multi-Modal Query Answering System with Retrieval-Augmented Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4333--4336", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685868", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Retrieval-augmented Large Language Models (LLMs) have reshaped traditional query-answering systems, offering unparalleled user experiences. However, existing retrieval techniques often struggle to handle multi-modal query contexts. In this paper, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Giannakouris:2024:DPL, author = "Victor Giannakouris and Immanuel Trummer", title = "{DBG-PT}: a Large Language Model Assisted Query Performance Regression Debugger", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4337--4340", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685869", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper we explore the ability of Large Language Models (LLMs) in analyzing and comparing query plans, and resolving query performance regressions. We present DBG-PT, a query regression debugging framework powered by LLMs. DBG-PT keeps track of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Campbell:2024:RMR, author = "Felix S. Campbell and Julia Stoyanovich and Yuval Moskovitch", title = "{Rodeo}: Making Refinements for Diverse Top-{$K$} Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4341--4344", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685870", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database queries are commonly used to select and rank items. With the increasing awareness of diversity, ensuring a diverse output (i.e., the representation of different groups in the top-$k$ positions) becomes essential. To address this challenge, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:QDQ, author = "Xin Zhang and Ahmed Eldawy", title = "{QPJVis Demo}: Quality-Boost Progressive Join Query Processing System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4345--4348", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685871", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Progressive query processing enables data scientists to efficiently analyze and explore large datasets. Data scientists can start further analyses earlier if the progressive result can represent the complete results well. Most progressive processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{VanNostrand:2024:CEA, author = "Peter M. VanNostrand and Dennis M. Hofmann and Lei Ma and Belisha Genin and Randy Huang and Elke A. Rundensteiner", title = "Counterfactual Explanation Analytics: Empowering Lay Users to Take Action Against Consequential Automated Decisions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4349--4352", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685872", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning is routinely used to automate consequential decisions about users in domains such as finance and healthcare, raising concerns of transparency and recourse for negative outcomes. Existing Explainable AI techniques generate a static \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:UUA, author = "Zhenrong Xu and Pengfei Wang and Guoze Xue and Qitong Yan and Shenghao Gong and Yelan Jiang and Yuren Mao and Yunjun Gao and Shu Shen and Wei Zhang and Dan Luo and Lu Chen", title = "{UniView}: a Unified Autonomous Materialized View Management System for Various Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4353--4356", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685873", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Materialized views (MVs) are critical for improving query performance of database systems, especially in online analytical processing (OLAP) databases. Typically, MVs are maintained by DBAs, which relies on prior knowledge and manual operations. Recently, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yao:2024:DTT, author = "Yuanyuan Yao and Shenjia Dai and Yilin Li and Lu Chen and Dimeng Li and Yunjun Gao and Tianyi Li", title = "A Demonstration of {TENDS}: Time Series Management System Based on Model Selection", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4357--4360", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685874", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The growth in sensor technologies, IoT devices, and information systems has opened up new opportunities for managing time series data across various domains. 
Despite significant progress, existing time series management systems face two crucial \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Althaus:2024:SEE, author = "Luca Althaus and Mourad Khayati and Abdelouahab Khelifati and Anton Dign{\"o}s and Djellel Difallah and Philippe Cudr{\'e}-Mauroux", title = "{SEER}: an End-to-End Toolkit for Benchmarking Time Series Database Systems in Monitoring Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4361--4364", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685875", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series database systems (TSDBs) are prevalent in many applications ranging from monitoring and IoT devices to scientific research. Those systems are specifically designed to efficiently manage data indexed by time. Because of the variety of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xue:2024:DDG, author = "Siqiao Xue and Danrui Qi and Caigao Jiang and Fangyin Cheng and Keting Chen and Zhiping Zhang and Hongyang Zhang and Ganglin Wei and Wang Zhao and Fan Zhou and Hong Yi and Shaodong Liu and Hongjun Yang and Faqiang Chen", title = "Demonstration of {DB-GPT}: Next Generation Data Interaction System Empowered by Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4365--4368", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685876", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recent breakthroughs in large language models (LLMs) are positioned to transition many areas of software. In this paper, we present DB-GPT, a revolutionary and product-ready Python library that integrates LLMs into traditional data interaction tasks \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:DQS, author = "Zheng Zhang and Zhuhan Shao and Andrew Crotty", title = "{DeepSketch}: a Query Sketching Interface for Deep Time Series Similarity Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4369--4372", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685877", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "By empowering domain experts to perform interactive exploration of large time series datasets, sketch-based query interfaces have revitalized interest in the well-studied problem of time series similarity search. 
In this new interaction paradigm, recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bao:2024:RCD, author = "Zian Bao and Binbin Bie and Wenfei Fan and Daji Li and Mengyun Li and Kaiwen Lin and Wei Lin and Peijie Liu and Peng Liu and Zhicong Lv and Mingliang Ouyang and Chenyang Sun and Shuai Tang and Yaoshu Wang and Qiyuan Wei and Xiangqian Wu and Min Xie and Jing Zhang and Runxiao Zhao and Jie Zhu and Yilin Zhu", title = "{Rock}: Cleaning Data with both {ML} and Logic Rules", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4373--4376", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685878", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate Rock, a system for cleaning relational data. Rock highlights the following unique features: (1) it extends logic rules by embedding machine learning models as predicates, to benefit from both ML and logic deduction; (2) it supports entity \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2024:CDC, author = "Xiaoou Ding and Yichen Song and Hongzhi Wang and Donghua Yang and Chen Wang and Jianmin Wang", title = "{Clean4TSDB}: a Data Cleaning Tool for Time Series Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4377--4380", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685879", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Billions of data points are generated by devices equipped with thousands of sensors, leading to significant data quality issues in time series data. These errors not only complicate time series data management but also compromise the accuracy and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chai:2024:LEE, author = "Chengliang Chai and Yuhao Deng and Yutong Zhan and Ziqi Cao and Yuanfang Zhang and Lei Cao and Yuping Wang and Zhiwei Zhang and Ye Yuan and Guoren Wang and Nan Tang", title = "{LakeCompass}: an End-to-End System for Data Maintenance, Search and Analysis in Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4381--4384", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685880", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Searching tables from poorly maintained data lakes has long been recognized as a formidable challenge in the realm of data management. 
There are three pivotal tasks: keyword-based, joinable and unionable table search, which form the backbone of tasks \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:DSG, author = "Jianzhe Yu and Wei Dong and Juanru Fang and Dajun Sun and Ke Yi", title = "{DOP-SQL}: a General-Purpose, High-Utility, and Extensible Private {SQL} System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4385--4388", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685881", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Differential privacy (DP) has garnered significant attention from both academia and industry due to its potential in offering robust privacy protection for individual data during analysis. With the increasing volume of sensitive information being \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mei:2024:CCA, author = "Baolong Mei and Yafei Li and Wei Chen and Linshen Luan and Guanglei Zhu and Yuanyuan Jin and Jianliang Xu", title = "{Catcher}: a Cache Analysis System for Top-$k$ Pub\slash Sub Service", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4389--4392", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685882", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Top-$k$ Publish/Subscribe (TkPS) service is widely studied in spatial database, with various cache-based methods proposed to address its efficiency challenge in top-$k$ result maintenance. 
These methods require in-depth exploration of relationships between \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vasileiadis:2024:ODT, author = "Sotiris Vasileiadis and Matthew Paraskeva and George Savva and Andreas Efstathiou and Edson Ramiro Lucas Filho and Jianqiang Shen and Lun Yang and Kebo Fu and Herodotos Herodotou", title = "Optimizing Distributed Tiered Data Storage Systems with {DITIS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4393--4396", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685883", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data storage systems are characterized by a distributed architecture as well as the presence of multiple storage tiers and caches. Both system developers and operators are challenged with the complexity of such systems as it is hard to evaluate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:VVQ, author = "Zhaozhuo Li and Xin Wang and Meng Wang and Yajun Yang and Bohan Li and Dong Han", title = "{VQFT}: a Visual Query Approach Based on Full-Text Search for Knowledge Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4397--4400", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685884", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing knowledge graph query approaches, whether traditional textual query languages or visual query languages, have steep learning curves that are unfriendly for non-expert users. This demonstration presents a Visual Query approach based on Full-Text \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:CCA, author = "Zhen Zhu and Yibo Wang and Shouqing Yang and Lin Long and Runze Wu and Xiu Tang and Junbo Zhao and Haobo Wang", title = "{CORAL}: Collaborative Automatic Labeling System Based on Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4401--4404", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685885", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of big data, data annotation is integral to numerous applications. However, it is widely acknowledged as a laborious and time-consuming process, significantly impeding the scalability and efficiency of data-driven applications. To reduce the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ni:2024:CEC, author = "Wangze Ni and Yiwei Zhao and Pengze Chen and Lei Chen and Peng Cheng and Chen Jason Zhang", title = "{CMixing}: an Efficient Coin Mixing Platform to Enhance Anonymity in Cryptocurrency Transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4405--4408", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685886", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Coin mixing methods are widely used to enhance anonymity in cryptocurrency transactions by obfuscating the linkages between recipients and senders. Specifically, coin mixing methods combine several users' transactions into a CoinJoin transaction and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Robson:2024:CPL, author = "Eliot W. Robson and Dhemath Reddy and Abhishek K. Umrawal", title = "{CyNetDiff}: a {Python} Library for Accelerated Implementation of Network Diffusion Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4409--4412", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685887", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In recent years, there has been increasing interest in network diffusion models and related problems. The most popular of these are the independent cascade and linear threshold models. Much of the recent experimental work done on these models requires a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fu:2024:EEL, author = "Zhe Fu and Mo Sha and Yiran Li and Huorong Li and Yubing Ma and Sheng Wang and Feifei Li", title = "{EncChain}: Enhancing Large Language Model Applications with Advanced Privacy Preservation Techniques", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4413--4416", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685888", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In response to escalating concerns about data privacy in the Large Language Model (LLM) domain, we demonstrate EncChain, a pioneering solution designed to bolster data security in LLM applications. EncChain presents an all-encompassing approach to data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shahbazi:2024:FSR, author = "Nima Shahbazi and Mahdi Erfanian and Abolfazl Asudeh and Fatemeh Nargesian and Divesh Srivastava", title = "{FairEM360}: a Suite for Responsible Entity Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4417--4420", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685889", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Entity matching is one of the earliest tasks that occur in the big data pipeline and is alarmingly exposed to unintentional biases that affect the quality of data. Identifying and mitigating the biases that exist in the data or are introduced by the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Naeem:2024:RRB, author = "Zan Ahmad Naeem and Mohammad Shahmeer Ahmad and Mohamed Eltabakh and Mourad Ouzzani and Nan Tang", title = "{RetClean}: Retrieval-Based Data Cleaning Using {LLMs} and Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4421--4424", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685890", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large language models (LLMs) have shown great potential in data cleaning, which is a fundamental task in all modern applications. In this demo proposal, we demonstrate that indeed LLMs can assist in data cleaning, e.g., filling in missing values in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Solleza:2024:MFT, author = "Franco Solleza and Shihang Li and William Sun and Richard Tang and Malte Schwarzkopf and Nesime Tatbul and Andrew Crotty and David Cohen and Stan Zdonik", title = "{Mach}: Firefighting Time-Critical Issues in Complex Systems Using High-Frequency Telemetry", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4425--4428", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685891", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To understand the complex interactions in modern software, engineers often rely on high-frequency telemetry (HFT) data generated via tools like eBPF. 
However, today's database systems are too slow for HFT's rate and volume and cannot process HFT within \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:SDZ, author = "Renzhi Wu and Pramod Chunduri and Dristi J Shah and Ashmitha Julius Aravind and Ali Payani and Xu Chu and Joy Arulraj and Kexin Rong", title = "{SketchQL} Demonstration: Zero-Shot Video Moment Querying with Sketches", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4429--4432", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685892", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we will present SketchQL, a video database management system (VDBMS) for retrieving video moments with a sketch-based query interface. This novel interface allows users to specify object trajectory events with simple mouse drag-and-drop \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:DIS, author = "Yiding Zhu and Hongwei Zhang and Jiayao Zhang and Jinfei Liu and Kui Ren", title = "{DataPrice}: an Interactive System for Pricing Datasets in Data Marketplaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4433--4436", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685893", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the flourishing of data-driven applications, data marketplaces, which can dramatically facilitate data utilization, have emerged recently. 
However, determining the appropriate price for datasets presents a significant challenge due to the intangible \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2024:DVE, author = "Pinhan Zhao and Yang He and Xinyu Wang and Yuepeng Wang", title = "Demonstration of the {VeriEQL} Equivalence Checker for Complex {SQL} Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4437--4440", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685894", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Equivalence checking for SQL queries has many real-world applications but typically requires supporting an expressive SQL language in order to be practical. We develop VeriEQL, a system that can prove and disprove equivalence of complex SQL queries. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:FSS, author = "Zeqi Zhu and Zeheng Fan and Yuxiang Zeng and Yexuan Shi and Yi Xu and Mengmeng Zhou and Jin Dong", title = "{FedSQ}: a Secure System for Federated Vector Similarity Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4441--4444", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685895", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector databases have emerged as crucial tools for managing and retrieving representation embeddings of unstructured data. Given the explosive growth of data, vector data is often distributed and stored across multiple organizations. 
However, privacy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wei:2024:FPF, author = "Shuyue Wei and Yuanyuan Zhang and Zimu Zhou and Tianlong Zhang and Ke Xu", title = "{FedSM}: a Practical Federated Shared Mobility System", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4445--4448", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685896", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shared mobility leverages under-utilized vehicles to offer on-demand transport services by sharing vehicles among users. It strives to match supply with demand via a series of data-intensive operations such as supply prediction and task assignment. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{vanRenen:2024:DSD, author = "Alexander van Renen and Mihail Stoian and Andreas Kipf", title = "{DataLoom}: Simplifying Data Loading with {LLMs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4449--4452", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685897", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Schema discovery and data loading is a crucial step in any data analysis pipeline. While this used to be a rare task, in the highly dynamic field of machine learning and modern business intelligence on top of data lakes, today it has become a frequent, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:DVT, author = "Jie Jeff Xu and Saahir Dhanani and Jorge Piazentin Ono and Wenbin He and Liu Ren and Kexin Rong", title = "Demonstration of {VCR}: a Tabular Data Slicing Approach to Understanding Object Detection Model Performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4453--4456", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685898", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this demonstration, we present VCR, an automated slice discovery method (SDM) for object detection models that helps practitioners identify and explain specific scenarios in which their models exhibit systematic errors. VCR leverages the capabilities \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:MPA, author = "Mengying Wang and Hanchao Ma and Sheng Guan and Yiyang Bian and Haolai Che and Abhishek Daundkar and Alp Sehirlioglu and Yinghui Wu", title = "{ModsNet}: Performance-Aware Top-$k$ Model Search Using Exemplar Datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4457--4460", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685899", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate ModsNet, a search tool for pre-trained data science MODels recommendatioN using Examplar daTaset. 
Given a set of pre-trained data science models, an ``example'' input dataset, and a user-specified performance metric, ModsNet answers the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2024:OWO, author = "Linshan Jiang and Moming Duan and Bingsheng He and Yulin Sun and Peishen Yan and Yang Hua and Tao Song", title = "{OFL-W3}: a One-Shot Federated Learning System on {Web} 3.0", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4461--4464", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685900", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Federated Learning (FL) addresses the challenges posed by data silos, which arise from privacy, security regulations, and ownership concerns. Despite these barriers, FL enables these isolated data repositories to participate in collaborative learning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2024:SDD, author = "Chang Gao and Tianlong Zhang and Yuxiang Zeng and Yi Xu and Shuyuan Li and Yuanyuan Zhang", title = "{Swift}: a Data-Driven Flight Planning System at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4465--4468", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685901", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Flight planning, a pivotal challenge in the airline industry, strives to achieve economic and flexible scheduling of airplanes to serve designated flight itineraries. 
As the demand for air transportation soars, traditional planning methods can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abdelhafeez:2024:PGS, author = "Laila Abdelhafeez and Andres Calderon-Romero and Amr Magdy and Vassilis J. Tsotras", title = "{Pyneapple-G}: Scalable Spatial Grouping Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4469--4472", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685902", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper demonstrates Pyneapple-G, an open-source library for scalable spatial grouping queries based on Apache Sedona (formerly known as GeoSpark). We demonstrate two modules, namely, SGPAC and DDCEL, that support grouping points, grouping lines, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Elyashiv:2024:PEU, author = "Itay Elyashiv and Amir Gilad and Edna Isakov and Tal Tikochinsky and Amit Somech", title = "{PD-Explain}: a Unified {Python}-Native Framework for Query Explanations Over {DataFrames}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4473--4476", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685903", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Interfaces that rely on the Python programming language have become a popular tool for data analysis and exploration. In particular, the Pandas library allows users to query, manipulate, and visualize data in an easy and intuitive manner. 
However, users \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guan:2024:HDS, author = "Jiawei Guan and Feng Zhang and Yuxin Tang and Weitang Ye and Xiaoyong Du", title = "{HocoPG}: a Database System with Homomorphic Compression for Text Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4477--4480", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685904", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Databases employ out-of-line storage and compression strategies to manage extensive text data. However, the growth in both the size of individual data items and overall data volume has significantly increased the burden of decompression, adversely \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2024:CAI, author = "Xinyang Zhao and Xuanhe Zhou and Guoliang Li", title = "{Chat$2$Data}: an Interactive Data Analysis System with {RAG}, Vector Databases and {LLMs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4481--4484", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685905", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditional data analysis methods require users to write programming codes or issue SQL queries to analyze the data, which are inconvenient for ordinary users. Large language models (LLMs) can alleviate these limitations by enabling users to interact \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:PSM, author = "Shuhao Liu and Yang Liu and Wenfei Fan", title = "{PrismX}: a Single-Machine System for Querying Big Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4485--4488", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685906", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate PrismX (PRAM with SSDs as Memory eXtension), a single-machine system for graph analytics. PrismX allows users to make practical use of existing PRAM algorithms without any change. To cope with the limited DRAM capacity, it employs NVMe \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2024:TUC, author = "Zhiyu Liang and Chen Liang and Zheng Liang and Hongzhi Wang and Bo Zheng", title = "{TimeCSL}: Unsupervised Contrastive Learning of General Shapelets for Explorable Time Series Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4489--4492", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685907", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Unsupervised (a.k.a. Self-supervised) representation learning (URL) has emerged as a new paradigm for time series analysis, because it has the ability to learn generalizable time series representation beneficial for many downstream tasks without using \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2024:HHL, author = "Xiangmin Zhou and Chengkun He and Xi Chen and Yanchun Zhang", title = "{HSAP}: a Human-in-the-Loop Social Media-Based Situation Awareness Platform", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4493--4496", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685908", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Situation-awareness (SA) has been important for natural disaster management and smart decision making. Traditionally, security officers recognize disaster situations through emergency reporting with phone calls. However, due to the busy phone lines or \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shen:2024:DMM, author = "Yiqi Shen and Sijia Li and Miaodong Shen and Peng Cai and Weiyuan Xu and Kai Li and Jinlong Cai", title = "{DB-MAGS}: Multi-Anomaly Data Generation System for Transactional Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4497--4500", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685909", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing database performance anomaly datasets have the problems of comprehensiveness in anomaly types, coarse-grained root causes, and unrealistic simulation for reproducing concurrent anomalies. To address these issues, we propose a data generation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:QGI, author = "Peizhi Wu and Yi Zhang and Wang-Chiew Tan and Zachary G. Ives", title = "{QuoteInspector}: Gaining Insight about Social Media Discussions", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4501--4504", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685910", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Our greatest source of insight into the real world today is via social media. Here, a major statement or quote by a public figure (world leader, politician, celebrity, scientist) can have wide-ranging impact, igniting extensive discussions and triggering \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:VDW, author = "Jianguo Wang and Eric Hanson and Guoliang Li and Yannis Papakonstantinou and Harsha Simhadri and Charles Xie", title = "Vector Databases: What's Really New and What's Next? ({VLDB 2024} Panel)", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4505--4506", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685911", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector databases have recently emerged as a hot topic in the field of databases, especially in industry. This is due to the widespread interest in Large Language Models (LLMs), where vector databases provide the relevant context for LLMs to produce more \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:HPS, author = "Fusheng Wang and Rubao Lee and Dejun Teng and Xiaodong Zhang and Joel Saltz", title = "High-Performance Spatial Data Analytics: Systematic {R\&D} for Scale-Out and Scale-Up Solutions from the Past to Now", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4507--4520", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685912", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We released open-source software Hadoop-GIS in 2011, and presented and published the work in VLDB 2013. This work initiated the development of a new spatial data analytical ecosystem characterized by its large-scale capacity in both computing and data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Amer-Yahia:2024:IAD, author = "Sihem Amer-Yahia", title = "Intelligent Agents for Data Exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4521--4530", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685913", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data Exploration is an incremental process that helps users express what they want through a conversation with the data. Reinforcement Learning (RL) is one of the most notable approaches to automate data exploration and several solutions have been \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kumar:2024:RDL, author = "Arun Kumar", title = "Reimagining Deep Learning Systems through the Lens of Data Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4531--4535", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685914", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The high-profile success of Deep Learning (DL) at Big Tech companies, including recent Large Language Models (LLMs) such as the GPT and Llama families, has led to high demand among Web companies, consumer app companies, enterprises, healthcare, domain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Leis:2024:LHP, author = "Viktor Leis", title = "{LeanStore}: a High-Performance Storage Engine for {NVMe SSDs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4536--4545", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685915", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Neither traditional disk-based database systems nor modern in-memory database systems are capable of fully exploiting modern servers with multiple NVMe SSDs. LeanStore is a high-performance OLTP storage engine specifically optimized for NVMe SSDs and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Madden:2024:DUQ, author = "Samuel Madden and Michael Cafarella and Michael Franklin and Tim Kraska", title = "Databases Unbound: Querying All of the World's Bytes with {AI}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4546--4554", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685916", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the past five decades, the relational database model has proven to be a scaleable and adaptable model for querying a variety of structured data, with use cases in analytics, transactions, graphs, streaming and more. However, most of the world's data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiao:2024:SID, author = "Xiaokui Xiao", title = "Sharing Information with Differential Privacy: a Database Perspective ({VLDB 2024} Keynote)", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4555--4555", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685917", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the digital age, the widespread collection and analysis of data pose significant privacy challenges. Differential privacy (DP) has emerged as a leading framework for ensuring that information release does not compromise individual privacy. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ozcan:2024:HMD, author = "Fatma Ozcan", title = "Harmonizing {ML} and Databases: a Symphony of Data ({VLDB 2024} Keynote)", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "12", pages = "4556--4556", month = aug, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3685800.3685918", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 16:04:43 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large language models (LLMs) are rapidly transforming the landscape of computing and daily life, demonstrating immense potential across diverse applications like natural language processing, machine translation, and code generation. This talk delves into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fruth:2024:CDL, author = "Michael Fruth and Stefanie Scherzinger", title = "The Case for {DBMS} Live Patching", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4557--4570", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704966", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704966", abstract = "Traditionally, when the code of a database management system (DBMS) needs to be updated, the system is restarted, and database clients suffer downtime, or the provider instantiates hot-standby instances and rolls over the workload. We investigate a third \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:TTB, author = "Guanghua Li and Hao Zhang and Xibo Sun and Qiong Luo and Yuanyuan Zhu", title = "{TenGraph}: a Tensor-Based Graph Query Engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4571--4584", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704967", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704967", abstract = "We propose a novel tensor-based approach to in-memory graph query processing. Tensors are multi-dimensional arrays, and have been utilized as data units in deep learning frameworks such as TensorFlow and PyTorch. Through tensors, these frameworks \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chan:2024:LLA, author = "Tsz Nam Chan and Bojian Zhu and Dingming Wu and Yun Peng and Leong Hou U", title = "{LARGE}: a Length-Aggregation-Based Grid Structure for Line Density Visualization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4585--4598", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704968", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704968", abstract = "Line Density Visualization (LDV) is an important operation of geospatial analysis, which has been extensively used in many application domains, e.g., urban planning, criminology, and transportation science. However, LDV is computationally demanding. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:CCC, author = "Haoran Zhang and Shuai Mu and Sebastian Angel and Vincent Liu", title = "{CausalMesh}: a Causal Cache for Stateful Serverless Computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4599--4613", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704969", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704969", abstract = "Stateful serverless workflows consist of multiple serverless functions that access state on a remote database. Developers sometimes add a cache layer between the serverless runtime and the database to improve I/O latency. However, in a serverless \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bellomarini:2024:VPS, author = "Luigi Bellomarini and Davide Benedetto and Matteo Brandetti and Emanuel Sallinger and Adriano Vlad", title = "The {Vadalog} Parallel System: Distributed Reasoning with {Datalog+/-}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4614--4626", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704970", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704970", abstract = "Over the past years, there has been a growing demand for ontological reasoning systems based on languages of the Datalog+/- family, such as Vadalog, for their ability to effectively model a wide range of real-world problems with powerful features such as \ldots{}", acknowledgement = 
ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiu:2024:PPA, author = "Haibo Xiu and Pankaj K. Agarwal and Jun Yang", title = "{PARQO}: Penalty-Aware Robust Plan Selection in Query Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4627--4640", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704971", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704971", abstract = "The effectiveness of a query optimizer relies on the accuracy of selectivity estimates. The execution plan generated by the optimizer can be extremely poor in reality due to uncertainty in these estimates. This paper presents PARQO (Penalty-Aware Robust \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:IRK, author = "Zezhong Xu and Yincen Qu and Wen Zhang and Lei Liang and Huajun Chen", title = "{InBox}: Recommendation with Knowledge Graph using Interest Box Embedding", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4641--4654", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704972", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704972", abstract = "Knowledge graphs (KGs) have become vitally important in modern recommender systems, effectively improving performance and interpretability. 
Fundamentally, recommender systems aim to identify user interests based on historical interactions and recommend \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{He:2024:BBA, author = "Zongyan He and Jeffrey Xu Yu", title = "A Branch-\&-Bound Algorithm for Fractional Hypertree Decomposition", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4655--4667", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704973", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704973", abstract = "Conjunctive queries (CQs) have been widely used in database systems in which acyclic CQs can be computed efficiently, whereas cyclic CQs may not. Here, a CQ is acyclic if its hypergraph representation H is acyclic. In order to find a class of CQs that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:SHQ, author = "Zeyu Wang and Qitong Wang and Xiaoxing Cheng and Peng Wang and Themis Palpanas and Wei Wang", title = "{Steiner}-Hardness: a Query Hardness Measure for Graph-Based {ANN} Indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4668--4682", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704974", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704974", abstract = "Graph-based indexes have been widely employed to accelerate approximate similarity search of high-dimensional vectors.
However, the performance of graph indexes to answer different queries varies vastly, leading to an unstable quality of service for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:SME, author = "Yiqi Wang and Long Yuan and Wenjie Zhang and Zi Chen and Xuemin Lin and Qing Liu", title = "Simpler is More: Efficient Top-{$K$} Nearest Neighbors Search on Large Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4683--4695", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704975", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704975", abstract = "Top-$k$ Nearest Neighbors ($k$NN) problem on road network has numerous applications on location-based services. As direct search using the Dijkstra's algorithm results in a large search space, a plethora of complex-index-based approaches have been proposed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fischer:2024:SEE, author = "Tim Fischer and Denis Hirn and Torsten Grust", title = "{SQL} Engines Excel at the Execution of Imperative Programs", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4696--4708", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704976", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704976", abstract = "SQL query engines can act as efficient runtime environments for the execution of imperative programs over database-resident tabular data. To make this point, we lay out the details of a compilation strategy that maps the basic blocks of arbitrarily \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yogatama:2024:SYH, author = "Bobbi Yogatama and Weiwei Gong and Xiangyao Yu", title = "Scaling your Hybrid {CPU-GPU} {DBMS} to Multiple {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4709--4722", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704977", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704977", abstract = "GPU-accelerated databases have been gaining popularity in recent years due to their massive parallelism and high memory bandwidth. The limited GPU memory capacity, however, is still a major bottleneck for GPU databases. Existing approaches have attempted \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abduvakhobov:2024:SMB, author = "Abduvoris Abduvakhobov and S{\o}ren Kejser Jensen and Torben Bach Pedersen and Christian Thomsen", title = "Scalable Model-Based Management of Massive High Frequency Wind Turbine Data with {ModelarDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4723--4732", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704978", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704978", abstract = "Modern wind turbines are monitored by sensors that generate massive amounts of high frequency time series that are ingested on the edge and then transferred to the cloud where they are stored and analyzed. This results in at least four challenges: (1) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kashi:2024:ED, author = "Tejasvi Kashi and Kenneth Salem and Jaemyung Kim and Khuzaima Daudjee", title = "Eventual Durability", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4733--4745", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704979", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704979", abstract = "For latency-critical transactional applications, durability is often what limits performance. That is, executing transactions is fast, but guaranteeing that they are durable is slow. 
As a result, most of each transaction's latency is attributable to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiu:2024:TTT, author = "Ruizhong Qiu and Jun-Gi Jang and Xiao Lin and Lihui Liu and Hanghang Tong", title = "{TUCKET}: a Tensor Time Series Data Structure for Efficient and Accurate Factor Analysis over Time Ranges", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4746--4759", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704980", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704980", abstract = "Given an evolving tensor time series and multiple time ranges, how can we compute Tucker decomposition for each time range efficiently and accurately? Tucker decomposition has been widely used in a variety of applications to obtain latent factors of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Meng:2024:TPG, author = "Yuchen Meng and Rong-Hua Li and Longlong Lin and Xunkai Li and Guoren Wang", title = "Topology-Preserving Graph Coarsening: an Elementary Collapse-Based Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4760--4772", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704981", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704981", abstract = "Graph coarsening techniques aim at simplifying the graph structure while preserving key properties in the resulting coarsened graph, have been widely used in graph partitioning and graph neural networks (GNNs). Existing graph coarsening techniques mainly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:ECM, author = "Guanli Liu and Lars Kulik and Christian S. Jensen and Tianyi Li and Renata Borovica-Gajic and Jianzhong Qi", title = "Efficient Cost Modeling of Space-Filling Curves", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4773--4785", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704982", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704982", abstract = "A space-filling curve (SFC) maps points in a multi-dimensional space to one-dimensional points by discretizing the multi-dimensional space into cells and imposing a linear order on the cells. 
This way, an SFC enables computing a one-dimensional layout \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Reis:2024:GDC, author = "Eduardo Reis and Mohamed Abdelaal and Carsten Binnig", title = "Generalizable Data Cleaning of Tabular Data in Latent Space", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4786--4798", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704983", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704983", abstract = "In this paper, we present a new method for learned data cleaning. In contrast to existing methods, our method learns to clean data in the latent space. The main idea is that we (1) shape the latent space such that we know the area where clean data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Anadiotis:2024:DGD, author = "Angelos Christos Anadiotis and Muhammad Ghufran Khan and Ioana Manolescu", title = "Dynamic Graph Databases with Out-of-Order Updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4799--4812", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704984", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704984", abstract = "Several real-time applications rely on dynamic graphs to model and store data arriving from multiple streams. 
Providing both high ingestion rate and efficient analytics with transactional guarantees is challenging, even more so when updates may be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeng:2024:PDD, author = "Lingze Zeng and Naili Xing and Shaofeng Cai and Gang Chen and Beng Chin Ooi and Jian Pei and Yuncheng Wu", title = "Powering In-Database Dynamic Model Slicing for Structured Data Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4813--4826", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704985", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704985", abstract = "Relational database management systems (RDBMS) are widely used for the storage of structured data. To derive insights beyond statistical aggregation, we typically have to extract specific subdatasets from the database using conventional database \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:GGS, author = "Hongfu Li and Qian Tao and Song Yu and Shufeng Gong and Yanfeng Zhang and Feng Yao and Wenyuan Yu and Ge Yu and Jingren Zhou", title = "{GastCoCo}: Graph Storage and Coroutine-Based Prefetch Co-Design for Dynamic Graph Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4827--4839", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704986", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704986", abstract = "An efficient data structure is fundamental to meeting the growing demands in dynamic graph processing. However, the dual requirements for graph computation efficiency (with contiguous structures) and graph update efficiency (with linked list-like \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2024:MEC, author = "Xiaoou Ding and Yichen Song and Hongzhi Wang and Chen Wang and Donghua Yang", title = "{MTSClean}: Efficient Constraint-Based Cleaning for Multi-Dimensional Time Series Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4840--4852", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704987", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704987", abstract = "The widespread existence of time series data in information systems poses significant challenges to data cleaning due to its quality issues, particularly the complex interdependencies among attributes and the persistence of errors. Existing semantic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kiouche:2024:NPG, author = "Abd Errahmane Kiouche and Julien Baste and Mohammed Haddad and Hamida Seba and Angela Bonifati", title = "Neighborhood-Preserving Graph Sparsification", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4853--4866", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704988", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704988", abstract = "We introduce a new graph sparsification method that targets the neighborhood information available for each node. 
Our approach is motivated by the fact that neighborhood information is used by several mining and learning tasks on graphs as well as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Urban:2024:EEL, author = "Matthias Urban and Carsten Binnig", title = "{ELEET}: Efficient Learned Query Execution over Text and Tables", journal = j-PROC-VLDB-ENDOWMENT, volume = "17", number = "13", pages = "4867--4880", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3704965.3704989", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:24 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3704965.3704989", abstract = "In this paper, we present ELEET, a novel execution engine that allows one to seamlessly query and process text as a first-class citizen along with tables. To enable such a seamless integration of text and tables, ELEET leverages learned multi-modal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arch:2024:KEU, author = "Samuel Arch and Yuchen Liu and Todd C. Mowry and Jignesh M. 
Patel and Andrew Pavlo", title = "The Key to Effective {UDF} Optimization: Before Inlining, First Perform Outlining", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "1--13", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696436", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696436", abstract = "Although user-defined functions (UDFs) are a popular way to augment SQL's declarative approach with procedural code, the mismatch between programming paradigms creates a fundamental optimization challenge. UDF inlining automatically removes all UDF calls \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hajidehi:2024:CSG, author = "Milad Rezaei Hajidehi and Sraavan Sridhar and Margo Seltzer", title = "{CUTTANA}: Scalable Graph Partitioning for Faster Distributed Graph Databases and Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "14--27", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696437", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696437", abstract = "Graph partitioning plays a pivotal role in various distributed graph processing applications, including graph analytics, graph neural network training, and distributed graph databases. A ``good'' graph partitioner reduces workload execution time, worker \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Moerkotte:2024:CEH, author = "Guido Moerkotte", title = "Cardinality Estimation for Having-Clauses", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "28--41", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696438", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696438", abstract = "We present several methods for estimating the result cardinality of single table queries with a having clause. More specifically, we provide cardinality estimates for predicates using the aggregate functions count(*), sum(B), avg(B), min(B), and max(B). \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2024:CHD, author = "Wenqi Jiang and Marco Zeller and Roger Waleffe and Torsten Hoefler and Gustavo Alonso", title = "{Chameleon}: a Heterogeneous and Disaggregated Accelerator System for Retrieval-Augmented Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "42--52", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696439", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696439", abstract = "A Retrieval-Augmented Language Model (RALM) combines a large language model (LLM) with a vector database to retrieve context-specific knowledge during text generation. This strategy facilitates impressive generation quality even with smaller models, thus \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:LRL, author = "Zhaodonghui Li and Haitao Yuan and Huiming Wang and Gao Cong and Lidong Bing", title = "{LLM-R$^2$}: a Large Language Model Enhanced Rule-Based Rewrite System for Boosting Query Efficiency", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "53--65", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696440", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696440", abstract = "Query rewrite, which aims to improve query efficiency by altering an SQL query's structure without changing its result, has been an important research problem. In order to maintain equivalence between the rewritten query and the original one during \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:NBD, author = "Hanfei Yu and Jacob Carter and Hao Wang and Devesh Tiwari and Jian Li and Seung-Jong Park", title = "{Nitro}: Boosting Distributed Reinforcement Learning with Serverless Computing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "1", pages = "66--79", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3696435.3696441", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3696435.3696441", abstract = "Deep reinforcement learning (DRL) has demonstrated significant potential in various applications, including gaming AI, robotics, and system scheduling. 
DRL algorithms produce, sample, and learn from training data online through a trial-and-error process, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2024:RET, author = "Silin Zhou and Shuo Shang and Lisi Chen and Christian S. Jensen and Panos Kalnis", title = "{RED}: Effective Trajectory Representation Learning with Comprehensive Information", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "80--92", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705830", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705830", abstract = "Trajectory representation learning (TRL) maps trajectories to vectors that can then be used for various downstream tasks, including trajectory similarity computation, trajectory classification, and travel-time estimation. However, existing TRL methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arpaci-Dusseau:2024:AFA, author = "Anna Arpaci-Dusseau and Zixiang Zhou and Xuhao Chen", title = "Accurate and Fast Approximate Graph Pattern Mining at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "93--107", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705831", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705831", abstract = "Approximate graph pattern mining (A-GPM) is an important data analysis tool for numerous graph-based applications. 
There exist sampling-based A-GPM systems to provide automation and generalization over a wide variety of use cases. Despite improved \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2024:QGD, author = "Xiu Tang and Wenhao Liu and Sai Wu and Chang Yao and Gongsheng Yuan and Shanshan Ying and Gang Chen", title = "{QueryArtisan}: Generating Data Manipulation Codes for Ad-hoc Analysis in Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "108--116", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705832", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705832", abstract = "Query processing over data lakes is a challenging task, often requiring extensive data pre-processing activities such as data cleaning, transformation, and loading. However, the advent of Large Language Models (LLMs) has illuminated a new pathway to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2024:EEA, author = "Yiqian Huang and Shiqi Zhang and Laks V. S. 
Lakshmanan and Wenqing Lin and Xiaokui Xiao and Bo Tang", title = "Efficient and Effective Algorithms for A Family of Influence Maximization Problems with A Matroid Constraint", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "117--129", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705833", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705833", abstract = "Influence maximization (IM) is a classic problem that aims to identify a small group of critical individuals, known as seeds, who can influence the largest number of users in a social network through word-of-mouth. This problem finds important \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deeds:2024:CFA, author = "Kyle Deeds and Diandre Sabale and Moe Kayali and Dan Suciu", title = "{Color}: a Framework for Applying Graph Coloring to Subgraph Cardinality Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "130--143", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705834", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705834", abstract = "Graph workloads pose a particularly challenging problem for query optimizers. They typically feature large queries made up of entirely many-to-many joins with complex correlations. This puts significant stress on traditional cardinality estimation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:FAC, author = "Xinle Wu and Xingjian Wu and Dalin Zhang and Miao Zhang and Chenjuan Guo and Bin Yang and Christian S. Jensen", title = "Fully Automated Correlated Time Series Forecasting in Minutes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "144--157", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705835", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705835", abstract = "Societal and industrial infrastructures and systems increasingly leverage sensors that emit correlated time series. Forecasting of future values of such time series based on recorded historical values has important benefits. Automatically designed models \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Markakis:2024:LCI, author = "Markos Markakis and Brit Youngmann and Trinity Gao and Ziyu Zhang and Rana Shahout and Peter Baile Chen and Chunwei Liu and Ibrahim Sabek and Michael Cafarella", title = "From Logs to Causal Inference: Diagnosing Large Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "158--172", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705836", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705836", abstract = "Causal inference can quantify cause-effect relationships in domains as varied as medicine, economics and public policy. 
Production computer systems exhibit a similar level of complexity and a recurring need to diagnose problems quickly. However, systems \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ai:2024:NLB, author = "Xin Ai and Hao Yuan and Zeyu Ling and Qiange Wang and Yanfeng Zhang and Zhenbo Fu and Chaoyi Chen and Yu Gu and Ge Yu", title = "{NeutronTP}: Load-Balanced Distributed Full-Graph {GNN} Training with Tensor Parallelism", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "173--186", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705837", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705837", abstract = "Graph neural networks (GNNs) have emerged as a promising direction. Training large-scale graphs that relies on distributed computing power poses new challenges. Existing distributed GNN systems leverage data parallelism by partitioning the input graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:UPE, author = "Yuhan Liu and Sheng Wang and Yixuan Liu and Feifei Li and Hong Chen", title = "Unleash the Power of Ellipsis: Accuracy-Enhanced Sparse Vector Technique with Exponential Noise", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "187--199", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705838", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705838", abstract = "The Sparse Vector Technique (SVT) is one of the most fundamental tools in differential privacy (DP). It works as a backbone for adaptive data analysis by answering a sequence of queries on a given dataset, and gleaning useful information in a privacy-\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2024:MDC, author = "Lijun Chang", title = "Maximum Defective Clique Computation: Improved Time Complexities and Practical Performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "200--212", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705839", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705839", abstract = "$k$-defective clique is a relaxation of the well-studied clique structure, by allowing up-to $k$ edges missing from a clique. The problem of finding a $k$-defective clique with the largest number of vertices, although being NP-hard, has been receiving \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2024:SAL, author = "Yanni Tang and Zhuoxing Zhang and Kaiqi Zhao and Lanting Fang and Zhenhua Li and Wu Chen", title = "Substructure-Aware Log Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "213--225", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705840", abstract = "System logs, recording critical information about system operations, serve as indispensable tools for system anomaly detection. Graph-based methods have demonstrated superior performance compared to other methods in capturing the interdependencies of log \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Miao:2024:LME, author = "Hao Miao and Ziqiao Liu and Yan Zhao and Chenjuan Guo and Bin Yang and Kai Zheng and Christian S. Jensen", title = "Less is More: Efficient Time Series Dataset Condensation via Two-Fold Modal Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "226--238", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705841", abstract = "The expanding instrumentation of processes throughout society with sensors yields a proliferation of time series data that may in turn enable important applications, e.g., related to transportation infrastructures or power grids. 
Machine-learning based \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2024:MGT, author = "Yunyao Cheng and Chenjuan Guo and Bin Yang and Haomin Yu and Kai Zhao and Christian S. Jensen", title = "A Memory Guided Transformer for Time Series Forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "239--252", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705842", abstract = "Accurate long-term forecasting from multivariate time series has important real-world applications. However, achieving this is challenging. Thus, analyses reveal that time series that span long durations often exhibit dynamic and disrupted \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2024:LLP, author = "Chuxuan Hu and Austin Peters and Daniel Kang", title = "{LEAP}: {LLM}-Powered End-to-End Automatic Library for Processing Social Science Queries on Unstructured Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "253--264", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705843", abstract = "Social scientists are increasingly interested in analyzing the semantic information (e.g., emotion) of unstructured data (e.g., Tweets), where the semantic information is not natively present.
Performing this analysis in a cost-efficient manner requires \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kieu:2024:TTE, author = "Duc Kieu and Tung Kieu and Peng Han and Bin Yang and Christian S. Jensen and Bac Le", title = "{TEAM}: Topological Evolution-Aware Framework for Traffic Forecasting", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "265--278", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705844", abstract = "Due to the global trend towards urbanization, people increasingly move to and live in cities that then continue to grow. Traffic forecasting plays an important role in the intelligent transportation systems of cities as well as in spatio-temporal data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2024:CSD, author = "Geonho Lee and Jeongho Park and Min-Soo Kim", title = "{Chimera}: a System Design of Dual Storage and Traversal-Join Unified Query Processing for {SQL\slash PGQ}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "279--292", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705845", abstract = "As graphs are being used increasingly in various industries, a new standard of SQL (called SQL:2023) has incorporated SQL with Property Graph Queries (SQL/PGQ) as a core feature. While some approaches process graph queries within RDBMSs using graph view \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Merkel:2024:CGR, author = "Nikolai Merkel and Pierre Toussing and Ruben Mayer and Hans-Arno Jacobsen", title = "Can Graph Reordering Speed Up Graph Neural Network Training? {An} Experimental Study", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "293--307", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705846", abstract = "Graph neural networks (GNNs) are a type of neural network capable of learning on graph-structured data.
However, training GNNs on large-scale graphs is challenging due to iterative aggregations of high-dimensional features from neighboring vertices \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:HAR, author = "Xiaoke Zhu and Min Xie and Ting Deng and Qi Zhang", title = "{HyperBlocker}: Accelerating Rule-Based Blocking in Entity Resolution Using {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "308--321", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705847", abstract = "This paper studies rule-based blocking in Entity Resolution (ER). We propose HyperBlocker, a GPU-accelerated system for blocking in ER. As opposed to previous blocking algorithms and parallel blocking solvers, HyperBlocker employs a pipelined \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2024:CNG, author = "Yangfan Jiang and Xinjian Luo and Yin Yang and Xiaokui Xiao", title = "Calibrating Noise for Group Privacy in Subsampled Mechanisms", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "322--334", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705848", abstract = "Given a group size m and a sensitive dataset D, group privacy (GP) releases information about D (e.g., weights of a neural network trained on D) with the guarantee that the adversary cannot infer with high confidence whether the underlying data is D or a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:OFC, author = "Yi Liu and Minghao Xie and Shouqian Shi and Yuanchao Xu and Heiner Litz and Chen Qian", title = "{Outback}: Fast and Communication-Efficient Index for Key--Value Store on Disaggregated Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "335--348", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705849", abstract = "Disaggregated memory systems achieve resource utilization efficiency and system scalability by distributing computation and memory resources into distinct pools of nodes. 
RDMA is an attractive solution to support high-throughput communication between \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mao:2024:MCE, author = "Yunhao Mao and Gengrui Zhang and Zongxin Liu and Pezhman Nasirifard and Sofia Tijanic and Hans-Arno Jacobsen", title = "Making {CRDTs} Not So Eventual", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "349--362", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705850", abstract = "Conflict-free replicated data types (CRDTs) are highly available and performant data replication solutions for distributed applications. However, their eventual consistency guarantees are often insufficient for ensuring application correctness, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2024:MPS, author = "Shuohao Gao and Kaiqiang Yu and Shengxin Liu and Cheng Long", title = "Maximum $k$-Plex Search: an Alternated Reduction-and-Bound Method", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "363--376", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705851", abstract = "$k$-plexes relax cliques by allowing each vertex to disconnect to at most $k$ vertices.
Finding a maximum $k$-plex in a graph is a fundamental operator in graph mining and has been receiving significant attention from various domains. The state-of-the-art \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schafer:2024:DLM, author = "Patrick Sch{\"a}fer and Ulf Leser", title = "Discovering Leitmotifs in Multidimensional Time Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "377--389", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705852", abstract = "A leitmotif is a recurring theme in literature, movies or music that carries symbolic significance for the piece it is contained in. When this piece can be represented as a multi-dimensional time series (MDTS), such as acoustic or visual observations, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2024:SSL, author = "Chuang Yang and Renhe Jiang and Xiaohang Xu and Chuan Xiao and Kaoru Sezaki", title = "{SIMformer}: Single-Layer Vanilla Transformer Can Learn Free-Space Trajectory Similarity", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "390--398", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705853", abstract = "Free-space trajectory similarity calculation, e.g., DTW, Hausdorff, and Fr{\'e}chet, often incur quadratic time complexity, thus learning-based methods have been proposed to accelerate the computation. The core idea is to train an encoder to transform \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:CCU, author = "Junchang Wang and Manos Athanassoulis", title = "{CUBIT}: Concurrent Updatable Bitmap Indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "399--412", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705854", abstract = "Bitmap indexes are widely used for read-intensive analytical workloads because they are clustered and offer efficient reads with a small memory footprint. However, they are generally inefficient to update. As analytical applications are increasingly \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ge:2024:PED, author = "Yunqing Ge and Jianbin Qin and Shuyuan Zheng and Yongrui Zhong and Bo Tang and Yu-Xuan Qiu and Rui Mao and Ye Yuan and Makoto Onizuka and Chuan Xiao", title = "Privacy-Enhanced Database Synthesis for Benchmark Publishing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "413--425", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705855", abstract = "Benchmarking is crucial for evaluating a DBMS, yet existing benchmarks often fail to reflect the varied nature of user workloads. As a result, there is increasing momentum toward creating databases that incorporate real-world user data to more accurately \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hong:2024:TGA, author = "Kijae Hong and Kyoungmin Kim and Young-Koo Lee and Yang-Sae Moon and Sourav S. Bhowmick and Wook-Shin Han", title = "{Themis}: a {GPU}-Accelerated Relational Query Execution Engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "426--438", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705856", abstract = "GPU-accelerated relational query execution engines have parallelized the execution of a pipeline, a sequence of operators. 
For the parallelization, the engines evenly partition the tuples in a table that will be scanned by the pipeline's first operator \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agmon:2024:FCV, author = "Shunit Agmon and Amir Gilad and Brit Youngmann and Shahar Zoarets and Benny Kimelfeld", title = "Finding Convincing Views to Endorse a Claim", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "439--452", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705857", abstract = "Recent studies investigated the challenge of assessing the strength of a given claim extracted from a dataset, particularly the claim's potential of being misleading and cherry-picked. We focus on claims that compare answers to an aggregate query posed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2024:QPC, author = "Yumeng Song and Yu Gu and Tianyi Li and Yushuai Li and Christian S. 
Jensen and Ge Yu", title = "Quantifying Point Contributions: a Lightweight Framework for Efficient and Effective Query-Driven Trajectory Simplification", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "453--465", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705858", abstract = "As large volumes of trajectory data accumulate, simplifying trajectories to reduce storage and querying costs is increasingly studied. Existing proposals face three main problems. First, they require numerous iterations to decide which GPS points to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:MGM, author = "Liwei Deng and Tianfu Wang and Yan Zhao and Kai Zheng", title = "{MILLION}: a General Multi-Objective Framework with Controllable Risk for Portfolio Management", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "466--474", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Sat Mar 1 06:14:26 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = "https://dl.acm.org/doi/10.14778/3705829.3705859", abstract = "Portfolio management is an important yet challenging task in AI for FinTech, which aims to allocate investors' budgets among different assets to balance the risk and return of an investment. In this study, we propose a general Multi-objectIve framework \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:CRS, author = "Yuxi Liu and Fangzhu Shen and Kushagra Ghosh and Amir Gilad and Benny Kimelfeld and Sudeepa Roy", title = "The Cost of Representation by Subset Repairs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "475--487", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:49:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Datasets may include errors, and specifically violations of integrity constraints, for various reasons. Standard techniques for ``minimal-cost'' database repairing resolve these violations by aiming for a minimum change in the data, and in the process, may \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2024:COU, author = "Mark Zhao and Emanuel Adamiak and Christos Kozyrakis", title = "{cedar}: Optimized and Unified Machine Learning Input Data Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "488--502", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:49:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The input data pipeline is an essential component of each machine learning (ML) training job. It is responsible for reading massive amounts of training data, processing batches of samples using complex transformations, and loading them onto training \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sanghavi:2024:GST, author = "Monil Mukesh Sanghavi and Ming-May Hu and Zhenxiao Luo and Xiao Li and Kapil Bajaj", title = "{Goku}: a Schemaless Time Series Database for Large Scale Monitoring at {Pinterest}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "2", pages = "503--515", month = oct, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3705829.3705862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:49:29 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Engineers rely heavily on observability tools to monitor their business and system metrics and set up alerting on it. A reliable and efficient monitoring system is very important for development velocity. In this paper, we introduce Goku, a time series \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiang:2024:AOL, author = "Zhangcheng Qiang and Weiqing Wang and Kerry Taylor", title = "{Agent-OM}: Leveraging {LLM} Agents for Ontology Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "516--529", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ontology matching (OM) enables semantic interoperability between different ontologies and resolves their conceptual heterogeneity by aligning related entities. OM systems currently have two prevailing design paradigms: conventional knowledge-based expert \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:GES, author = "Xue Li and Weibin Zeng and Zhibin Wang and Diwen Zhu and Jingbo Xu and Wenyuan Yu and Jingren Zhou", title = "{GraphAr}: an Efficient Storage Scheme for Graph Data in Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "530--543", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712223", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data lakes, increasingly adopted for their ability to store and analyze diverse types of data, commonly use columnar storage formats like Parquet and ORC for handling relational tables. However, these traditional setups fall short when it comes to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lan:2024:CES, author = "Hai Lan and Shixun Huang and Zhifeng Bao and Renata Borovica-Gajic", title = "Cardinality Estimation for Similarity Search on High-Dimensional Data Objects: The Impact of Reference Objects", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "544--556", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712224", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we study the problem of cardinality estimation for similarity search on high-dimensional data (CE4HD). We aim to perform CE4HD with high data robustness (i.e., robust to different datasets), query robustness (i.e., robust to large \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2024:ETF, author = "Seonho Lee and Yeunjun Lee and Kunsoo Park", title = "Efficient Top-$k$ Frequent Subgraph Mining using Tight Upper and Lower Bounds", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "557--570", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712225", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Frequent subgraph mining is an important and well-studied problem with numerous applications such as the prediction of protein functionalities and graph indexing. Many studies use the minimum-image-based support (MNI) to measure the frequency of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:MMS, author = "Hao Liu and Qianwen Yang and Taoyong Cui and Wei Wang", title = "{MSGNN}: Masked Schema based Graph Neural Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "571--584", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712226", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Heterogeneous graph representation learning aims to extract low-dimensional node representations from complex networks with different types of entities and relationships. With the prevalence of heterogeneous information networks (HINs) in real-world \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tahir:2024:HRS, author = "Jawad Tahir and Ruben Mayer and Christoph Doblander and Hans-Arno Jacobsen", title = "How Reliable are Streams? {End}-to-End Processing-Guarantee Validation and Performance Benchmarking of Stream Processing Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "585--598", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712227", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Stream processing systems (SPSs) provide processing guarantees to ensure reliability under failure. However, no related work exists that empirically validates these guarantees. In this paper, we present PGVal, a tool that can end-to-end validate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2024:TSG, author = "Yinnian Lin and Lei Zou and Xunbin Su", title = "Towards Sufficient {GPU}-Accelerated Dynamic Graph Management: Survey and Experiment", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "599--612", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712228", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dynamic graph management (DGM) systems are designed to effectively handle changing graph data, which is a fundamental problem for many graph-based applications. Recently, researchers have designed GPU-based solutions for DGM and its downstream \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shang:2024:RDP, author = "Zhuocheng Shang and Samriddhi Singla and Ahmed Eldawy and Elia Scudiero", title = "{RDPro}: Distributed Processing of Big Raster Data: [Scalable Data Science]", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "613--622", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712229", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Advancements in remote sensing technology allowed for collecting vast amounts of satellite and aerial imagery with up to 1 cm pixel resolutions, stored in raster format crucial for various research fields. However, processing this data poses challenges, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:AAD, author = "Qi Zhang and Yalong Zhang and Rong-Hua Li and Guoren Wang", title = "Approximate Anchored Densest Subgraph Search on Large Static and Dynamic Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "623--636", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712230", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Densest subgraph search, aiming to identify a subgraph with maximum edge density, faces limitations as the edge density inadequately reflects biases towards a given vertex set $R$. To address this, the $R$-subgraph density was introduced, refining the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeng:2024:PPM, author = "Tianjing Zeng and Junwei Lan and Jiahong Ma and Wenqing Wei and Rong Zhu and Yingli Zhou and Pengfei Li and Bolin Ding and Defu Lian and Zhewei Wei and Jingren Zhou", title = "{PRICE}: a Pretrained Model for Cross-Database Cardinality Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "637--650", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712231", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality estimation (CardEst) is essential for optimizing query execution plans. Recent ML-based CardEst methods achieve high accuracy but face deployment challenges due to high preparation costs and lack of transferability across databases. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gilray:2024:DFC, author = "Thomas Gilray and Arash Sahebolamri and Yihao Sun and Sowmith Kunapaneni and Sidharth Kumar and Kristopher Micinski", title = "{Datalog} with First-Class Facts", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "651--665", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712232", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Datalog is a popular logic programming language for deductive reasoning tasks in a wide array of applications, including business analytics, program analysis, and ontological reasoning. However, Datalog's restriction to flat facts over atomic constants \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2024:AED, author = "Junhao Zhu and Tao Wang and Danlei Hu and Ziquan Fang and Lu Chen and Yunjun Gao and Tianyi Li and Christian S. Jensen", title = "{T-Assess}: an Efficient Data Quality Assessment System Tailored for Trajectory Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "666--674", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712233", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the widespread use of GPS-enabled devices and services, trajectory data fuels services in a variety of fields, such as transportation and smart cities. However, trajectory data often contains errors stemming from inaccurate GPS measurements, low \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ye:2024:LLC, author = "Junhao Ye and Jiahui Li and Lu Chen and Yuren Mao and Yunjun Gao and Tianyi Li", title = "{LEAP}: a Low-Cost Spark {SQL} Query Optimizer using Pairwise Comparison", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "675--687", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712234", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Selecting a good execution plan can significantly improve the query efficiency of Spark SQL. Several machine learning-based techniques have been proposed to select good execution plans for DBMS, but none of them perform well on Spark SQL due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2024:TPO, author = "Xinle Cao and Weiqi Feng and Jian Liu and Jinjin Zhou and Wenjing Fang and Lei Wang and Quanqing Xu and Chuanhui Yang and Kui Ren", title = "Towards Practical Oblivious Map", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "688--701", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712235", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Oblivious map (OMAP) is an important component in encrypted databases, utilized to prevent the server inferring sensitive information about client's encrypted databases based on access patterns. Despite its widespread usage and importance, existing OMAP \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ruan:2024:PAD, author = "Chaoyi Ruan and Yingqiang Zhang and Juncheng Zhang and Cheng Li and Xiaosong Ma and Hao Chen and Jie Zhou and Feifei Li and Xinjun Yang", title = "{PolyBase}: Adapting to Data Affinity Changes in Geo-Replicated Database via Row-Level Consensus-Group Affiliation Re-Assignment", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "702--714", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712236", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transaction performance in geo-replicated databases heavily relies on the request location: when not issued by the primary region, transactions are forced to involve costly wide-area communication. 
While existing systems distribute primary roles across \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2024:EGB, author = "Wenfei Fan and Lihang Fan and Dandan Lin and Min Xie", title = "Explaining {GNN}-Based Recommendations in Logic", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "715--728", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712237", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes Makex (MAKE senSE), a logic approach to explaining why a GNN-based model $M(x, y)$ recommends item $y$ to user $x$. It proposes a class of Rules for ExPlanations, denoted as REPs and defined with a graph pattern $Q$ and dependency $X \rightarrow M(x, y)$, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yin:2024:ECH, author = "Haozhe Yin and Kai Wang and Wenjie Zhang and Ying Zhang and Ruijia Wu and Xuemin Lin", title = "Efficient Computation of Hyper-Triangles on Hypergraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "729--742", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712238", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Hypergraphs, which use hyperedges to capture groupwise interactions among different entities, have gained increasing attention recently for their versatility in effectively modeling real-world networks. In this paper, we study the problem of computing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2024:LBA, author = "Yuwei Huang and Guoliang Li", title = "{Laser}: Buffer-Aware Learned Query Scheduling in Master-Standby Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "743--755", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712239", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Master-standby database deployment is a commonly adopted database architecture in modern production environments, thanks to its fault tolerance and high availability. However, despite the architecture's widespread application in various online services, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:SMS, author = "Yang Liu and Wenfei Fan and Shuhao Liu and Xiaoke Zhu and Jianxin Li", title = "A Single Machine System for Querying Big Graphs with {PRAM}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "756--769", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712240", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper develops Planar (Plug and play PRAM), a single-machine system for graph analytics by reusing existing PRAM algorithms, without the need for designing new parallel algorithms. Planar supports both out-of-core and in-memory analytics. When a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:CGH, author = "Jiajia Li and Yongzhi Chen and Mengxuan Zhang and Lei Li", title = "A {CPU-GPU} Hybrid Labelling Algorithm for Massive Shortest Distance Queries on Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "770--783", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712241", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Shortest distance computation is a fundamental operation in graph-related applications, especially in location-based services. The most efficient method is hop-labeling, which can answer queries in microseconds. However, when the traffic condition \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ye:2024:SEE, author = "Fei Ye and Zikang Liu and Xi Zhang and Yinan Jing and Zhenying He and Yuxin Che and Haoran Xiong and Kai Zhang and X. Sean Wang", title = "{SDEcho}: Efficient Explanation of Aggregated Sequence Difference", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "784--797", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712242", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Understanding the reasons behind differences between aggregated sequences derived from SQL queries is crucial for data scientists. However, existing methods often suffer from being labor-intensive, lacking scalability, providing only approximate \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2024:MMB, author = "Qideng Tang and Chaofan Dai and Yahui Wu and Haohao Zhou", title = "{MLP-Mixer} based Masked Autoencoders are Effective, Explainable and Robust for Time Series Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "798--811", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712243", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series anomaly detection remains one of the most active research areas in data mining due to its wide range of real-world applications. In recent years, numerous deep learning-based methods have been proposed for this task. However, deep learning- \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Deng:2024:EDA, author = "Liwei Deng and Penghao Chen and Ximu Zeng and Tianfu Wang and Yan Zhao and Kai Zheng", title = "Efficient Data-Aware Distance Comparison Operations for High-Dimensional Approximate Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "812--821", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712244", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "High-dimensional approximate K nearest neighbor search (AKNN) is a fundamental task for various applications, including information retrieval. Most existing algorithms for AKNN can be decomposed into two main components, i.e., candidate generation and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:SAB, author = "Shijie Zhang and Ru Cheng and Xinpeng Liu and Jiang Xiao and Hai Jin and Bo Li", title = "{Seer}: Accelerating Blockchain Transaction Execution by Fine-Grained Branch Prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "822--835", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712245", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Increasingly popular decentralized applications (dApps) with complex application logic incur significant overhead for executing smart contract transactions, which greatly limits public blockchain performance. Pre-executing transactions off the critical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:PBS, author = "Shangdi Yu and Jessica Shi and Jamison Meindl and David Eisenstat and Xiaoen Ju and Sasan Tavakkol and Laxman Dhulipala and Jakub {\L}{\k{a}}cki and Vahab Mirrokni and Julian Shun", title = "The {ParClusterers Benchmark Suite (PCBS)}: a Fine-Grained Analysis of Scalable Graph Clustering", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "836--849", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712246", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We introduce the ParClusterers Benchmark Suite (PCBS)---a collection of highly scalable parallel graph clustering algorithms and benchmarking tools that streamline comparing different graph clustering algorithms and implementations. 
The benchmark \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2024:SCT, author = "Shuang Liu and Chenglin Tian and Jun Sun and Ruifeng Wang and Wei Lu and Yongxin Zhao and Yinxing Xue and Junjie Wang and Xiaoyong Du", title = "Semantic Conformance Testing of Relational {DBMS}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "850--862", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712247", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational DBMS implementations are expected to adhere to SQL standards. However, there are currently no tools available that can automatically verify this conformance. The main reasons are twofold. First, the SQL standard specification, documented in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mo:2024:RLR, author = "Songsong Mo and Yue Zhao and Zhifeng Bao and Quanqing Xu and Chuanhui Yang and Gao Cong", title = "{RankPQO}: Learning-to-Rank for Parametric Query Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "863--875", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712248", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Parametric Query Optimization (PQO) is crucial for efficiently handling parametrized queries (PQ) in many database applications. This paper addresses two key challenges in existing PQO techniques, focusing on plan set generation and best plan selection. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hadar:2024:DDT, author = "Aviv Hadar and Tova Milo and Kathy Razmadze", title = "Datamap-Driven Tabular Coreset Selection for Classifier Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "876--888", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712249", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of data-driven decision-making, efficient machine learning model training is crucial. We present a novel algorithm for constructing tabular data coresets using datamaps created for Gradient Boosting Decision Trees models. The resulting \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{DeMan:2024:TSP, author = "Quinten {De Man} and Laxman Dhulipala and Adam Karczmarz and Jakub {\L}{\k{a}}cki and Julian Shun and Zhongqi Wang", title = "Towards Scalable and Practical Batch-Dynamic Connectivity", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "889--901", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712250", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the problem of dynamically maintaining the connected components of an undirected graph subject to edge insertions and deletions. We give the first parallel algorithm for the problem that is work-efficient, supports batches of updates, runs in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ni:2024:IER, author = "Shengquan Ni and Yicong Huang and Zuozhi Wang and Chen Li", title = "{IcedTea}: Efficient and Responsive Time-Travel Debugging in Dataflow Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "902--914", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712251", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:39:57 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dataflow systems have an increasing need to support a wide range of tasks in data-centric applications using latest techniques such as machine learning. These tasks often involve custom functions with complex internal states. Consequently, users need \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lee:2024:RTS, author = "Ge Lee and Shixun Huang and Zhifeng Bao and Yanchang Zhao", title = "Representative Time Series Discovery for Data Exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "3", pages = "915--928", month = nov, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3712221.3712252", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:52:49 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we address the critical task of discovering representative time series in exploratory data mining. We define a representative time series, referred to as similarity-bounded representative time series, as one that represents other time \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2024:EGE, author = "Yifan Song and Xiaolong Chen and Wenqing Lin and Jia Li and Chen Zhang and Yan Zhou and Lei Chen and Jing Tang", title = "Efficient Graph Embedding Generation and Update for Large-Scale Temporal Graph", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "929--942", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph embedding aims at mapping each node to a low-dimensional vector, beneficial for various applications like pattern matching, retrieval augmented generation and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohammed:2024:FMT, author = "Haneen Mohammed and Alexander Yao and Charlie Summers and Hongbin Zhong and Gromit Yeuk-Yin Chan and Subrata Mitra and Lampros Flokas and Eugene Wu", title = "{FaDE}: More Than a Million What-Ifs Per Second", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "943--955", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "What-if queries are the building blocks for many explanation and analytics applications---sensitivity analysis, hypothetical reasoning, data cleaning, probabilistic databases---that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2024:TIT, author = "Yuxin Yang and Hongkuan Zhou and Rajgopal Kannan and Viktor Prasanna", title = "Towards Ideal Temporal Graph Neural Networks: Evaluations and Conclusions after 10,000 {GPU} Hours", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "956--969", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Temporal Graph Neural Networks (TGNNs) have emerged as powerful tools for modeling dynamic interactions across various domains. The design space of TGNNs is notably complex, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2024:KTT, author = "Zhaoheng Li and Supawit Chockchowwat and Ribhav Sahu and Areet Sheth and Yongjoo Park", title = "{Kishu}: Time-Traveling for Computational Notebooks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "970--985", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Computational notebooks (e.g., Jupyter, Google Colab) are widely used by data scientists. A key feature of notebooks is the interactive computing model of iteratively executing cells (i.e., \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2024:GGB, author = "Ruiyao Ma and Yifan Zhu and Baihua Zheng and Lu Chen and Congcong Ge and Yunjun Gao", title = "{GTI}: Graph-Based Tree Index with Logarithm Updates for Nearest Neighbor Search in High-Dimensional Spaces", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "986--999", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Nearest neighbor search (NNS) is fundamental for high-dimensional space retrieval and impacts various fields, such as pattern recognition, information retrieval, recommendation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kaminsky:2024:IDD, author = "Youri Kaminsky and Eduardo H. M. Pena and Felix Naumann", title = "Incremental Detection of Denial Constraint Violations", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1000--1012", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Denial constraints (DCs) are well-known to express business rules on data. They subsume other integrity constraints (ICs), such as key constraints or functional dependencies. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2024:RCT, author = "Zhihao Chang and Linzhu Yu and Huan Li and Sai Wu and Gang Chen and Dongxiang Zhang", title = "Revisiting {CNNs} for Trajectory Similarity Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1013--1021", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Similarity search is a fundamental but expensive operator in querying trajectory data, due to its quadratic complexity of distance computation. To mitigate the computational burden for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chu:2024:MSB, author = "Deming Chu and Zhizhi Gao and Fan Zhang and Wenjie Zhang and Xuemin Lin and Zhihong Tian", title = "Most Similar Biclique Search at Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1022--1034", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717763", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The biclique is a fundamental model of bipartite cohesive subgraphs. To analyze a bipartite graph, many existing works seek the maximum biclique, that is, the biclique with the largest \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2024:SSA, author = "Chenghong Wang and Lina Qiu and Johes Bater and Yukui Luo", title = "{SPECIAL}: {SynoPsis AssistEd Secure Collaborative AnaLytics}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1035--1048", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717764", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Secure collaborative analytics (SCA) enables the processing of analytical SQL queries across data from multiple owners, even when direct data sharing is not possible. While \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2024:IDO, author = "Qingyin Lin and Jiangsu Du and Rui Li and Zhiguang Chen and Wenguang Chen and Nong Xiao", title = "{IncrCP}: Decomposing and Orchestrating Incremental Checkpoints for Effective Recommendation Model Training", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1049--1062", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717765", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Training large models for modern recommendation systems requires a substantial number of computational devices and extended periods. Since it is essential to store \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guan:2024:WWS, author = "Naiqing Guan and Nick Koudas", title = "{WeShap}: Weak Supervision Source Evaluation with {Shapley} Values", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1063--1076", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717766", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient data annotation stands as a significant bottleneck in training contemporary machine learning models. The Programmatic Weak Supervision (PWS) pipeline presents a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yu:2024:JLT, author = "Weiping Yu and Fan Wang and Xuwei Zhang and Siqiang Luo", title = "Are Joins over {LSM}-Trees Ready? {Take} {RocksDB} as an Example", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1077--1090", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717767", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "LSM-tree-based data stores are widely adopted in industries for their excellent performance. As data scale increases, disk-based join operations become indispensable yet costly for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2024:IGS, author = "Zheng Wu and Xuliang Zhu and Yixiang Fang and Jianliang Xu and Xin Huang", title = "Interactive Graph Search for Multiple Targets on {DAGs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1091--1103", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717768", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Interactive graph search (IGS) over DAGs aims to find a hidden target by asking interactive questions as few as possible. IGS is useful for many applications, e.g., facilitating \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2024:AAN, author = "Xianghong Xu and Tieying Zhang and Xiao He and Haoyang Li and Rong Kang and Shuai Wang and Linhui Xu and Zhimin Liang and Shangyu Luo and Lei Zhang and Jianjun Chen", title = "{AdaNDV}: Adaptive Number of Distinct Value Estimation via Learning to Select and Fuse Estimators", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1104--1117", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717769", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Estimating the Number of Distinct Values (NDV) is fundamental for numerous data management tasks, especially within database applications. However, most existing works primarily \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2024:UUI, author = "Anqi Liang and Pengcheng Zhang and Bin Yao and Zhongpu Chen and Yitong Song and Guangxu Cheng", title = "{UNIFY}: Unified Index for Range Filtered Approximate Nearest Neighbors Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1118--1130", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717770", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents an efficient and scalable framework for Range Filtered Approximate Nearest Neighbors Search (RF-ANNS) over high-dimensional vectors associated with \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2024:DAD, author = "Yingli Zhou and Qingshuo Guo and Yi Yang and Yixiang Fang and Chenhao Ma and Laks V. S. Lakshmanan", title = "In-Depth Analysis of Densest Subgraph Discovery in a Unified Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1131--1144", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717771", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As a fundamental topic in graph mining, Densest Subgraph Discovery (DSD) has found a wide spectrum of real applications. Several DSD algorithms, including exact and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhao:2024:SRA, author = "Fuheng Zhao and Shaleen Deep and Fotis Psallidas and Avrilia Floratou and Divyakant Agrawal and Amr {El Abbadi}", title = "{Sphinteract}: Resolving Ambiguities in {NL2SQL} through User Interaction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1145--1158", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717772", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Translating natural language questions into SQL queries (NL2SQL) is a challenging task of great practical importance. Prior work has extensively studied how to address NL2SQL using \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhuang:2024:NMC, author = "Zhihao Zhuang and Yingying Zhang and Kai Zhao and Chenjuan Guo and Bin Yang and Qingsong Wen and Lunting Fan", title = "Noise Matters: Cross Contrastive Learning for {Flink} Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1159--1168", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717773", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Flink clusters often suffer from hotspot issues where the monitored job delay and CPU usage keep rising and remain high. This necessitates the detection of anomalous time series \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ouyang:2024:RMR, author = "Biao Ouyang and Yingying Zhang and Hanyin Cheng and Yang Shu and Chenjuan Guo and Bin Yang and Qingsong Wen and Lunting Fan and Christian S. Jensen", title = "{RCRank}: Multimodal Ranking of Root Causes of Slow Queries in Cloud Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1169--1182", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717774", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the continued migration of storage to cloud database systems, the impact of slow queries in such systems on services and user experience is increasing. Root-cause diagnosis plays \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abdallah:2024:RID, author = "Hassan Abdallah and B{\'e}atrice Markhoff and Arnaud Soulet", title = "Ranking Indicator Discovery from Very Large Knowledge Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1183--1195", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717775", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ranking indicators are essential tools for comparing the importance of various entities such as cities or scientists. While extensively used in fields like econometrics and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bajaj:2024:GNN, author = "Saurabh Bajaj and Hojae Son and Juelin Liu and Hui Guan and Marco Serafini", title = "Graph Neural Network Training Systems: a Performance Comparison of Full-Graph and Mini-Batch", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1196--1209", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717776", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Neural Networks (GNNs) have gained significant attention in recent years due to their ability to learn representations of graph-structured data. Two common methods \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Su:2024:DAI, author = "Qingdong Su and Zhikang Wang and Zijing Tan and Shuai Ma", title = "Discovering Approximate Inclusion Dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1210--1222", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717777", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Inclusion dependencies (INDs) are widely used in data management tasks. The discovery techniques of INDs have thus received a lot of attention, for discovering INDs valid in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhuang:2024:DLB, author = "Zhutao Zhuang and Xinqi Zeng and Zhiguang Chen", title = "{DumpKV}: Learning Based Lifetime Aware Garbage Collection for Key Value Separation in {LSM-Tree}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1223--1236", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717778", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Key-value separation is used in LSM-tree to store large values in separate log files to reduce write amplification but requires garbage collection to recycle invalid values. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2024:RSA, author = "Boyu Zhang and He Huang and Yu-E Sun and Guoju Gao", title = "{RGS-Sketch}: an Accurate, Invertible, and Mergeable Sketch for Online Super Spreader Detection in High-Speed Data Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1237--1249", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717779", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Super spreader detection in high-speed data streams is crucial for numerous applications. Although many methods have emerged, existing works can hardly concurrently achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yuan:2024:VOM, author = "Yichao Yuan and Advait Iyer and Lin Ma and Nishil Talati", title = "{Vortex}: Overcoming Memory Capacity Limitations in {GPU-Accelerated} Large-Scale Data Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "4", pages = "1250--1263", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.14778/3717755.3717780", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:04 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite the high computational throughput of GPUs, limited memory capacity and bandwidth-limited CPU-GPU communication via PCIe links remain significant bottlenecks for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gavrielatos:2025:DSC, author = "Vasilis Gavrielatos and Antonios Katsarakis and Chris Jensen and Nikos Ntarmos", title = "{Dandelion}: Smaller Clusters, Bigger Speeds-Distributed Transactions Redefined", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1264--1277", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718058", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents an in-memory, RDMA-enabled, highly-available, transactional Key-Value Store (KVS), dubbed Dandelion, that significantly improves performance in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:EES, author = "Xiaoying Wang and Wentao Wu and Vivek Narasayya and Surajit Chaudhuri", title = "{Esc}: an Early-Stopping Checker for Budget-Aware Index Tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1278--1290", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718059", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Index tuning is a time-consuming process. One major performance bottleneck in existing index tuning systems is the large amount of ``what-if'' query optimizer calls that estimate the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:JEO, author = "Yilei Wang and Xiangdong Zeng and Sheng Wang and Feifei Li", title = "{Jodes}: Efficient Oblivious Join in the Distributed Setting", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1291--1304", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718060", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trusted execution environment (TEE) has provided an isolated and secure environment for building cloud-based analytic systems, but it still suffers from access pattern leakages \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:OCB, author = "Xunkai Li and Yinlin Zhu and Boyang Pang and Guochen Yan and Yeyu Yan and Zening Li and Zhengyu Wu and Wentao Zhang and Rong-Hua Li and Guoren Wang", title = "{OpenFGL}: a Comprehensive Benchmark for Federated Graph Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1305--1320", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718061", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Federated graph learning (FGL) is a promising distributed training paradigm for graph neural networks across multiple local systems without direct data sharing. This approach \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Langhi:2025:ECQ, author = "Samuele Langhi and Angela Bonifati and Riccardo Tommasini", title = "Evaluating Continuous Queries with Inconsistency Annotations", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1321--1334", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718062", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Continuous Queries (CQs) run indefinitely, processing infinite data streams and producing continuous outputs. They commonly use window functions to segment streams into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:MEV, author = "Zhi Wang and Ming Zhong and Yuanyuan Zhu and Tieyun Qian and Mengchi Liu and Jeffrey Xu Yu", title = "On More Efficiently and Versatilely Querying Historical $k$-Cores", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1335--1347", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718063", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The recently proposed historical $k$-core query introduces a new paradigm of structure analysis for temporal graphs. However, the query processing based on the existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:GRL, author = "Yi Li and Gao Cong", title = "{GeoBloom}: Revisiting Lightweight Models for Geographic Information Retrieval", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1348--1361", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718064", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Geographic Information Retrieval (GIR) systems process text queries with geographic location to identify relevant geographic objects for users. Although recent advancements have \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ghasemirad:2025:VVI, author = "Shabnam Ghasemirad and Si Liu and Christoph Sprenger and Luca Multazzu and David Basin", title = "{VerIso}: Verifiable Isolation Guarantees for Database Transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1362--1375", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718065", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Isolation bugs, stemming especially from design-level defects, have been repeatedly found in carefully designed and extensively tested production databases over decades. In \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hong:2025:HAI, author = "Yinhao Hong and Hongyao Zhao and Wei Lu and Xiaoyong Du and Yuxing Chen and Anqun Pan and Lixiong Zheng", title = "A Hybrid Approach to Integrating Deterministic and Non-Deterministic Concurrency Control in Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1376--1389", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718066", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deterministic and non-deterministic concurrency control algorithms have shown respective advantages under diverse workloads. Thus, a natural idea is to blend them together. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Geisler:2025:GMM, author = "Sandra Geisler and Cinzia Cappiello and Irene Celino and David Chaves-Fraga and Anastasia Dimou and Ana Iglesias-Molina and Maurizio Lenzerini and Anisa Rula and Dylan {Van Assche} and Sascha Welten and Maria-Esther Vidal", title = "From Genesis to Maturity: Managing Knowledge Graph Ecosystems Through Life Cycles", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1390--1397", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718067", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge graphs (KGs) play a crucial role in the integration and organization of heterogeneous data and knowledge, enabling advanced data analytics and decision-making across \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lanzinger:2025:AMG, author = "Matthias Lanzinger and Reinhard Pichler and Alexander Selzer", title = "Avoiding Materialisation for Guarded Aggregate Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1398--1411", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718068", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Optimising queries with many joins is known to be a hard problem. The explosion of intermediate results as opposed to a much smaller final result poses a serious challenge to modern \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Luo:2025:SCS, author = "Chengyang Luo and Qing Liu and Yunjun Gao and Jianliang Xu", title = "Synergetic Community Search over Large Multilayer Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1412--1424", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718069", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Community search is a fundamental problem in graph analysis and has attracted much attention for its ability to discover personalized communities. In this paper, we focus on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:SDS, author = "Shu Wang and Yixiang Fang and Wensheng Luo", title = "Searching and Detecting Structurally Similar Communities in Large Heterogeneous Information Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1425--1438", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718070", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Heterogeneous information networks (HINs) are prevalent in various domains, including bibliographic information networks, social media, and knowledge graphs. As a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:CDW, author = "Gengrui Zhang and Shiquan Zhang and Michail Bachras and Yuqiu Zhang and Hans-Arno Jacobsen", title = "{Cabinet}: Dynamically Weighted Consensus Made Fast", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1439--1452", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718071", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Conventional consensus algorithms, such as Paxos and Raft, encounter inefficiencies when applied to large-scale distributed systems due to the requirement of waiting for replies \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dehghankar:2025:MMU, author = "Mohsen Dehghankar and Abolfazl Asudeh", title = "Mining the {Minoria}: Unknown, Under-Represented, and Under-Performing Minority Groups", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1453--1480", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718072", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to a variety of reasons, such as privacy, data in the wild often misses the grouping information required for identifying minorities. On the other hand, it is known that machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hou:2025:ECU, author = "Guanhao Hou and Jinchao Huang and Fangyuan Zhang and Sibo Wang", title = "Efficient Concurrent Updates to Persistent Randomized Binary Search Trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1481--1494", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718074", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of big data, the demand for historical data analytics is growing across various applications. Simultaneously, range queries have been extensively explored within the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ofek:2025:EBB, author = "Sariel Ofek and Amit Somech", title = "Explaining Black-Box Clustering Pipelines with Cluster-Explorer", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1495--1508", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718075", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Explaining the results of clustering pipelines by unraveling the characteristics of each cluster is a challenging task, often addressed manually through visualizations and queries. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2025:BBA, author = "Jianfeng Huang and Yihao Cao and Shubing Ren and Baohua Wu and Dongjing Miao", title = "{BACH}: Bridging Adjacency List and {CSR} Format Using {LSM-Trees} for {HGTAP} Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1509--1521", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718076", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data-intensive applications require databases that support fast analytical processing on massive dynamic graphs in real time, while simultaneously providing transactional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:FHP, author = "Hua Fan and Hao Tan and Wenchao Zhou and Feifei Li", title = "{FLEET}: High-Performance Durable Replicated State Machines Using Scattered and Coordinated Log Entries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1522--1535", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718077", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed coordination services are fundamental components of distributed systems, employing durable replicated state machines (RSMs) to ensure consistency across \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2025:BHD, author = "Guoxin Kang and Zhongxin Ge and Jingpei Hu and Xueya Zhang and Lei Wang and Jianfeng Zhan", title = "{BigVectorBench}: Heterogeneous Data Embedding and Compound Queries are Essential in Evaluating Vector Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "5", pages = "1536--1550", month = jan, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3718057.3718078", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:05 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector databases are designed to effectively store, organize, and retrieve high-dimensional vectors, enabling faster and more accurate querying and analysis. This study \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guan:2025:SSE, author = "Jiawei Guan and Feng Zhang and Jiesong Liu and Xiaoyong Du and Xipeng Shen", title = "A Systematic Study on Early Stopping Metrics in {HPO} and the Implications of Uncertainty", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1551--1564", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725689", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The development of hyperparameter optimization (HPO) algorithms is an important topic within both the machine learning and data management domains. While numerous strategies \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:TDP, author = "Haoying Zhang and Mariem Brahem and Nicolas Anciaux and Benjamin Nguyen and Jose Maria de Fuentes", title = "{TELESAFE}: Detecting {Private\slash Work} Boundary Crossings in Energy Consumption Trails in Telework", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1565--1578", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725690", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Teleworking has become a social gain following the COVID-19 lock-downs. In many professions, remote work is becoming a common practice, either at the employee's home or in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:FTM, author = "Yuan Chen and Ao Li and Wenhai Li and Lingfeng Deng", title = "{FB+}-Tree: a Memory-Optimized {B+}-Tree with Latch-Free Update", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1579--1592", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725691", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "B$^+$-trees are prevalent in traditional database systems due to their versatility and balanced structure. While binary search is typically utilized for branch operations, it may lead to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2025:VDS, author = "Shenghao Gong and Haobo Sun and Ziquan Fang and Liu Liu and Lu Chen and Yunjun Gao", title = "{VStream}: a Distributed Streaming Vector Search System", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1593--1606", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725692", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector search is widely employed in recommendation systems, search engines, etc. With the explosive growth of online data and streaming processing engines, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mang:2025:EHB, author = "Qiuyang Mang and Jingbang Chen and Hangrui Zhou and Yu Gao and Yingli Zhou and Qingyu Shi and Richard Peng and Yixiang Fang and Chenhao Ma", title = "Efficient Historical Butterfly Counting in Large Temporal Bipartite Networks via Graph Structure-Aware Index", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1607--1620", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725693", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Bipartite graphs are ubiquitous in many domains, e.g., e-commerce platforms, social networks, and academia, by modeling interactions between distinct entity sets. Within these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohanaraj:2025:PPS, author = "Abiram Mohanaraj and Matteo Lissandrini and Katja Hose", title = "{PlanRGCN}: Predicting {SPARQL} Query Performance", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1621--1634", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725694", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query Performance Prediction (QPP) is the task of predicting the query runtime performance prior to its execution. While QPP has been studied in relational database systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Davidson:2025:HQA, author = "Susan B. Davidson and Tova Milo and Kathy Razmadze and Gal Zeevi", title = "Holistic Query Approximation via {RL} Modeling", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1635--1648", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725695", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In data exploration, executing queries over a large database can be time-consuming. Previous work has proposed approximate query processing as a way to speed up aggregate queries \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gottesburen:2025:UGP, author = "Lars Gottesb{\"u}ren and Laxman Dhulipala and Rajesh Jayaram and Jakub {\L}{\k{a}}cki", title = "Unleashing Graph Partitioning for Large-Scale Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1649--1662", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725696", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the fundamental problem of decomposing a large-scale approximate nearest neighbor search (ANNS) problem into smaller sub-problems. The goal is to partition \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ma:2025:LEB, author = "Jiaming Ma and Binwu Wang and Pengkun Wang and Zhengyang Zhou and Xu Wang and Yang Wang", title = "{BiST}: a Lightweight and Efficient Bi-Directional Model for Spatiotemporal Prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1663--1676", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725697", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "While existing spatiotemporal prediction models have shown promising performance, they often rely on the assumption of input-label spatiotemporal consistency, and their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{You:2025:QUD, author = "Zhengxin You and Qiaomu Shen and Man Lung Yiu and Bo Tang", title = "{QOVIS}: Understanding and Diagnosing Query Optimizer via a Visualization-Assisted Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1677--1690", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725698", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Understanding and diagnosing query optimizers is crucial to guarantee the correctness and efficiency of query processing in database systems. However, achieving this is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jacob:2025:UAD, author = "Vincent Jacob and Yanlei Diao", title = "Unsupervised Anomaly Detection in Multivariate Time Series across Heterogeneous Domains", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1691--1704", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725699", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The widespread adoption of digital services, along with the scale and complexity at which they operate, has made incidents in IT operations increasingly more likely, diverse, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fu:2025:NSE, author = "Zhenbo Fu and Xin Ai and Qiange Wang and Yanfeng Zhang and Shizhan Lu and Chaoyi Chen and Chunyu Cao and Hao Yuan and Zhewei Wei and Yu Gu and Yingyou Wen and Ge Yu", title = "{NeutronTask}: Scalable and Efficient Multi-{GPU} {GNN} Training with Task Parallelism", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1705--1719", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725700", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph neural networks (GNNs) have emerged as a promising method for learning from graph data, but large-scale GNN training requires extensive memory and computation resources. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hai:2025:QDM, author = "Rihan Hai and Shih-Han Hung and Tim Coopmans and Tim Littau and Floris Geerts", title = "Quantum Data Management in the {NISQ} Era", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1720--1729", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725701", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Quantum computing has emerged as a transformative force in the evolution of computing technology. Recent efforts have applied quantum techniques to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2025:GVV, author = "Yunjia Zheng and Charlotte Sacr{\'e} and Mohanna Shahrad and Owen Lipchitz and Yu Ting Gu and Bettina Kemme", title = "{G-View}: View Management for Graph Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1730--1742", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725702", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph database systems (GDBS) have become popular for representing real-world entities and their relationships, and offering convenient query languages based on graph pattern \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Du:2025:PFL, author = "Rong Du and Qingqing Ye and Yue Fu and Haibo Hu", title = "Privacy for Free: Leveraging Local Differential Privacy Perturbed Data from Multiple Services", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1743--1755", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725703", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Local Differential Privacy (LDP) has emerged as a widely adopted privacy-preserving technique in modern data analytics, enabling users to share statistical insights while maintaining \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2025:KOD, author = "Haoze Song and Yongqi Wang and Xusheng Chen and Hao Feng and Yazhi Feng and Xieyun Fang and Heming Cui and Linghe Kong", title = "{K2}: On Optimizing Distributed Transactions in a Multi-Region Data Store with {TrueTime} Clocks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1756--1769", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725704", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "TrueTime clocks (TTCs) that offer accurate and reliable time within limited uncertainty bounds have been increasingly implemented in many clouds. Multi-region data stores that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:MIP, author = "Tingyang Chen and Cong Fu and Kun Wang and Xiangyu Ke and Yunjun Gao and Wenchao Zhou and Yabo Ni and Anxiang Zeng", title = "Maximum Inner Product is Query-Scaled Nearest Neighbor", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1770--1783", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725705", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Maximum Inner Product Search (MIPS) for high-dimensional vectors is pivotal across databases, information retrieval, and artificial intelligence. Existing methods either reduce MIPS to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:MFE, author = "Rongzhao Chen and Xiangpeng Hu and Xiangdong Huang and Chen Wang and Shaoxu Song and Jianmin Wang", title = "Migration-Free Elastic Storage of Time Series in {Apache IoTDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1784--1797", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725706", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In distributed time series databases (TSDBs), time series data are typically partitioned by both series and time. These partitions are then allocated to shards, whose replicas \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gheerbrant:2025:GSP, author = "Am{\'e}lie Gheerbrant and Leonid Libkin and Liat Peterfreund and Alexandra Rogova", title = "{GQL} and {SQL\slash PGQ}: Theoretical Models and Expressive Power", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1798--1810", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725707", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "SQL/PGQ and GQL are very recent international standards for querying property graphs: SQL/PGQ specifies how to query relational representations of property graphs in SQL, while GQL is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2025:PTG, author = "Peizhi Wu and Haoshu Xu and Ryan Marcus and Zachary G. 
Ives", title = "A Practical Theory of Generalization in Selectivity Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1811--1824", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725708", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query-driven machine learning models have emerged as a promising estimation technique for query selectivities. Yet, surprisingly little is known about the efficacy of these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:RIC, author = "Shuo Yang and Jiadong Xie and Yingfan Liu and Jeffrey Xu Yu and Xiyue Gao and Qianru Wang and Yanguo Peng and Jiangtao Cui", title = "Revisiting the Index Construction of Proximity Graph-Based Approximate Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1825--1838", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725709", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Proximity graphs (PG) have gained increasing popularity as the state-of-the-art solutions to $k$-approximate nearest neighbor ($k$-ANN) search on high-dimensional data, which \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bei:2025:MPP, author = "Yijun Bei and Teng Ma and Dongxiang Zhang and Sai Wu and Kian-Lee Tan and Gang Chen", title = "Mining Platoon Patterns from Traffic Videos", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1839--1851", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725710", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Discovering co-movement patterns from urban-scale video data sources has emerged as an attractive topic. This task aims to identify groups of objects that travel together along a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2025:ASD, author = "Botong Huang and Lianggui Weng and Wei Chen and Kai Zeng and Yihui Feng and Bolin Ding and Jingren Zhou and Zuozhi Wang and Chen Li", title = "{Agamotto}: Scheduling of Deadline-Oriented Incremental Query Execution under Uncertain Resource Price", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1852--1864", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725711", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Incremental query processing is widely used in data warehouses and streaming systems. While many optimization techniques are developed to generate incremental query plans, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cai:2025:SED, author = "Baoqing Cai and Yu Liu and Lin Ma and Pingqi Huang and Bingcheng Lian and Ke Zhou and Jia Yuan and Jie Yang and Xiaofan Cai and Peijun Wu", title = "{SCompression}: Enhancing Database Knob Tuning Efficiency Through Slice-Based {OLTP} Workload Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1865--1878", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725712", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Workload execution can account for 90\% of the total database knob tuning time, which is often the bottleneck for efficient knob tuning in practice. Reducing the tuning time by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2025:FDT, author = "Xiyue Gao and Zhuang Liu and Yiran Shen and Hui Li and Yingfan Liu and Hongjun Xiao and Yanguo Peng and Jiangtao Cui", title = "{Fucci}: Database Transaction Fuzzing via Random Conflict Construction and Multilevel Constraint Solving", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1879--1891", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725713", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ensuring the ACID properties of transactions is the fundamental functionality of transactional DBMSs. However, through our study on existing solutions on transaction management, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:STS, author = "Wenjing Wang and Ziyang Yue and Bolong Zheng", title = "Streaming Time Series Subsequence Anomaly Detection: a Glance and Focus Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1892--1904", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725714", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subsequence anomaly detection for time series is a crucial problem in various real-world applications. However, existing methods proposed so far design the anomaly score \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Du:2025:ISE, author = "Leilei Du and Peng Cheng and Lei Chen and Heng Tao Shen and Xuemin Lin and Wei Xi", title = "Infinite Stream Estimation under Personalized $w$-Event Privacy", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1905--1918", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725715", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Streaming data collection is indispensable for stream data analysis, such as event monitoring. However, publishing these data directly leads to privacy leaks. $w$-event privacy is a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:GGE, author = "Meng Wang and Gus Waldspurger and Naufal Ananda and Yuyang Huang and Kemas Wiharja and John Bent and Swaminathan Sundararaman and Vijay Chidambaram and Haryadi S. Gunawi", title = "{GPEmu}: a {GPU} Emulator for Faster and Cheaper Prototyping and Evaluation of Deep Learning System Research", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1919--1932", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725716", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deep learning (DL) system research is often impeded by the limited availability and expensive costs of GPUs. In this paper, we introduce GPEmu, a GPU emulator for faster and cheaper \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeng:2025:CDS, author = "Anna Zeng and Michael Cafarella and Batya Kenig and Markos Markakis and Brit Youngmann and Babak Salimi", title = "Causal {DAG} Summarization", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1933--1947", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725717", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Causal inference aids researchers in discovering cause-and-effect relationships, leading to scientific insights. Accurate causal estimation requires identifying confounding \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ye:2025:MFT, author = "Zhengmao Ye and Dengchun Li and Zetao Hu and Tingfeng Lan and Jian Sha and Shicong Zhang and Lei Duan and Jie Zuo and Hui Lu and Yuanchun Zhou and Mingjie Tang", title = "{mLoRA}: Fine-Tuning {LoRA} Adapters via Highly-Efficient Pipeline Parallelism in Multiple {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1948--1961", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725718", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Transformer-based large language models (LLMs) have demonstrated outstanding performance across diverse domains, particularly in the emerging pretrain-then-finetune \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kim:2025:ADS, author = "Abigale Kim and Marco Slot and David G. Andersen and Andrew Pavlo", title = "Anarchy in the Database: a Survey and Evaluation of Database Management System Extensibility", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1962--1976", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725719", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:58:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Extensions allow applications to expand the capabilities of database management systems (DBMSs) with custom logic. However, the extensibility environment for some DBMSs is fraught with perils, causing developers to resort to unorthodox methods to achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2025:FFQ, author = "Longxu Sun and Xin Huang and Jiannan Wang and Jianliang Xu", title = "A Flexible Framework for Query-Oriented Interactive Community Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1977--1990", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725720", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:58:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Community search finds query-dependent communities over graphs, which has been investigated broadly. In this work, we focus on the task of returning only a single connected community containing all user input query vertices. Most existing studies in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2025:TEB, author = "Ziyi Yan and Mohamed Farouk Drira and Tianxun Hu and Tianzheng Wang", title = "{Tabular}: Efficiently Building Efficient Indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "6", pages = "1991--2004", month = feb, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3725688.3725721", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 07:58:06 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Concurrent indexes are hard to build by requiring complex, careful yet error-prone processes of design and implementation. As prior work has observed, modeling indexes as transactional tables can largely ease programming. The developer only needs to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zeng:2025:EMH, author = "Yuanyuan Zeng and Yixiang Fang and Kun Chen and Yangfan Li and Chenhao Ma", title = "Efficient Maintenance of 2-Hop Labeling Index on Dynamic Small-World Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2005--2017", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734840", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "2-hop labeling has been widely utilized to accelerate the efficiency of online shortest distance queries. Given the nature of frequent changes in real-world graphs, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:VDP, author = "Kexin Zhu and Michael Whittaker and Srdjan Petrovic and Robert Grandl and Sanjay Ghemawat", title = "Vive la Diff{\'e}rence: Practical Diff Testing of Stateful Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2018--2030", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734841", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Software rollout is the process of replacing the version of an application that is currently running in production with a new version. Many subtle and catastrophic bugs occur \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2025:GGR, author = "Siyue Wu and Dingming Wu and Sinhong Cheuk and Tsz Nam Chan and Kezhong Lu", title = "{GREAT}: Generalized Reservoir Sampling Based Triangle Counting Estimation over Streaming Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2031--2043", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734842", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The number of triangles of a streaming graph is a crucial metric with various applications, such as network evolution analysis, community detection, and anomaly detection. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:EDR, author = "Mengran Li and Zijing Tan and Honghui Yang and Shuai Ma", title = "Efficient Discovery of Relaxed Functional Dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2044--2056", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734843", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper studies the discovery of relaxed functional dependencies (RFDs). We consider RFDs that relax restrictions in both value equality and constraint satisfaction: treating \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2025:STS, author = "Danlei Hu and Yilin Li and Lu Chen and Ziquan Fang and Yushuai Li and Yunjun Gao and Tianyi Li", title = "{SimRN}: Trajectory Similarity Learning in Road Networks Based on Distributed Deep Reinforcement Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2057--2069", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734844", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trajectory similarity computation in road networks is crucial for data analytics. However, both non-learning-based and learning-based methods face challenges. First, they \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2025:BNL, author = "Yuxiang Guo and Zhonghao Hu and Yuren Mao and Baihua Zheng and Yunjun Gao and Mingwei Zhou", title = "{Birdie}: Natural Language-Driven Table Discovery Using Differentiable Search Index", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2070--2083", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734845", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Natural language (NL)-driven table discovery identifies relevant tables from large table repositories based on NL queries. While current deep-learning-based methods using \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:SCO, author = "Shu Liu and Xiangxi Mo and Moshik Hershcovitch and Henric Zhang and Audrey Cheng and Guy Girmonsky and Gil Vernik and Michael Factor and Tiemo Bang and Soujanya Ponnapalli and Natacha Crooks and Joseph E. Gonzalez and Danny Harnik and Ion Stoica", title = "{SkyStore}: Cost-Optimized Object Storage Across Regions and Clouds", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2084--2096", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734846", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern applications span multiple clouds to reduce costs, avoid vendor lock-in, and leverage low-availability resources in another cloud. However, standard object stores \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ren:2025:PCN, author = "Tonghui Ren and Chen Ke and Yuankai Fan and Yinan Jing and Zhenying He and Kai Zhang and X. Sean Wang", title = "The Power of Constraints in Natural Language to {SQL} Translation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2097--2111", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734847", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Current large language model (LLM)-based Natural Language to SQL (NL2SQL) approaches typically rely on the database schema and partial data values for the translation. These \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sheng:2025:ACE, author = "Yufan Sheng and Xin Cao and Kaiqi Zhao and Yixiang Fang and Jianzhong Qi and Wenjie Zhang and Christian S. Jensen", title = "{ACE}: a Cardinality Estimator for Set-Valued Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2112--2125", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734848", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality estimation is a fundamental functionality in database systems. Most existing cardinality estimators focus on handling predicates over numeric or categorical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:ABQ, author = "Xiaoying Wang and Jiannan Wang and Tianzheng Wang and Yong Zhang", title = "{Accio}: Bolt-on Query Federation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2126--2135", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734849", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data scientists today often need to analyze data from various places. This makes it necessary for corresponding engines to support query federation (i.e., the ability to perform \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dai:2025:FAA, author = "Xiaohai Dai and Chaozheng Ding and Wei Li and Jiang Xiao and Bolin Zhang and Chen Yu and Albert Y. 
Zomaya and Hai Jin", title = "{Falcon}: Advancing Asynchronous {BFT} Consensus for Lower Latency and Enhanced Throughput", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2136--2148", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734850", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Asynchronous Byzantine Fault Tolerant (BFT) consensus protocols have garnered significant attention with the rise of blockchain technology. A typical asynchronous protocol is \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2025:SPT, author = "Jindong Han and Hao Wang and Hui Xiong and Hao Liu", title = "Scalable Pre-Training of Compact Urban Spatio-Temporal Predictive Models on Large-Scale Multi-Domain Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2149--2158", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734851", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Spatio-Temporal Prediction (STP) is crucial for various smart city applications, such as traffic management and resource allocation. However, training samples can be scarce in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:HRF, author = "Xiaoyuan Liu and Ni Trieu and Trinabh Gupta and Ishtiyaque Ahmad and Dawn Song", title = "{HADES}: Range-Filtered Private Aggregation on Public Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2159--2171", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734852", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In aggregation queries, predicate parameters often reveal user intent. Protecting these parameters is critical for user privacy, regardless of whether the database is public or \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ji:2025:OBP, author = "Zhaoxuan Ji and Xinlu Wang and Zhaojing Luo and Zhongle Xie and Meihui Zhang", title = "Optimized Batch Prompting for Cost-Effective {LLMs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2172--2184", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734853", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large Language Models (LLMs) have recently demonstrated exceptional performance in various real-world data management tasks through in-context learning (ICL), which involves \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qin:2025:TDH, author = "Hongchao Qin and Guang Zeng and Rong-Hua Li and Longlong Lin and Ye Yuan and Guoren Wang", title = "Truss Decomposition in Hypergraphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2185--2197", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734854", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Truss decomposition is a fundamental approach in graph theory that focuses on uncovering cohesive subgraphs within networks. However, many networks involve groupwise \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:OSS, author = "Jianting Zhang and Zhongtang Luo and Raghavendra Ramesh and Aniket Kate", title = "Optimal Sharding for Scalable Blockchains with Deconstructed {SMR}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2198--2211", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734855", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sharding enhances blockchain scalability by dividing nodes into multiple shards to handle transactions in parallel. However, a size-security dilemma where every shard must be \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lai:2025:APH, author = "Eugenie Y. 
Lai and Yeye He and Surajit Chaudhuri", title = "{Auto-Prep}: Holistic Prediction of Data Preparation Steps for Self-Service Business Intelligence", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2212--2225", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734856", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Business Intelligence (BI) plays a critical role in empowering modern enterprises to make informed data-driven decisions, and has grown into a billion-dollar business. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guerrmi:2025:TSM, author = "Valerio Guerrmi and Thibaut Germain and Charles Truong and Laurent Oudre and Paul Boniol", title = "Time Series Motif Discovery: a Comprehensive Evaluation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2226--2239", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734857", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Motif Discovery involves identifying recurring patterns and locating their occurrences within a time series without prior knowledge about their shape or location. In practice, Motif \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bourgeois:2025:EDD, author = "Daniel Bourgeois and Zhimin Ding and Dimitrije Jankov and Jiehui Li and Mahmoud Sleem and Yuxin Tang and Jiawen Yao and Xinyu Yao and Chris Jermaine", title = "{EinDecomp}: Decomposition of Declaratively-Specified Machine Learning and Numerical Computations for Parallel Execution", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2240--2253", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734858", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We consider the problem of automatic parallelism in high-performance, tensor-based systems. Our focus is on intra-operator parallelism for inference tasks on a single GPU server or CPU \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2025:CLC, author = "Ruizhong Wu and Mengxuan Zhang and Shuxin Wang and Frodo Kin Sun Chan and Yan Nei Law and Lei Li", title = "Continuous Lifelong Conflict-Aware {AGV} Routing with Kinematic Constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2254--2267", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734859", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Automated Guided Vehicles (AGV) are becoming increasingly important in modern warehouses to cope with the enormous logistic demands of developing e-commerce and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:WHE, author = "Dawei Liu and Bolong Zheng and Ziyang Yue and Fuhao Ruan and Xiaofang Zhou and Christian S. Jensen", title = "{Wolverine}: Highly Efficient Monotonic Search Path Repair for Graph-Based {ANN} Index Updates", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2268--2280", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734860", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate nearest neighbor (ANN) search on high-dimensional vector data is core functionality in an increasing number of real-world applications. However, most \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2025:DSR, author = "Jiansen Song and Wensheng Dou and Yingying Zheng and Yu Gao and Ziyu Cui and Wei Wang and Jun Wei", title = "Detecting Schema-Related Logic Bugs in Relational {DBMSs} via Equivalent Database Construction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2281--2294", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734861", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational Database Management Systems (DBMSs) provide flexible DDL (Data Definition Language) statements that enable the creation, modification, and deletion of database \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kong:2025:GNM, author = "Weiyang Kong and Kaiqi Wu and Sen Zhang and Yubao Liu", title = "{GraphSparseNet}: a Novel Method for Large Scale Traffic Flow Prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "7", pages = "2295--2307", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3734839.3734862", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:08 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traffic flow forecasting is a critical spatio-temporal data mining task with wide-ranging applications in intelligent route planning and dynamic traffic management. Recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lai:2025:TFS, author = "Danling Lai and Jiajie Xu and Jianfeng Qu and Pingfu Chao and Junhua Fang and Chengfei Liu", title = "{TMLKD}: Few-Shot Trajectory Metric Learning via Knowledge Distillation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2308--2320", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742729", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trajectory metric learning, which supports the trajectory similarity search, is one of the most fundamental tasks in spatial-temporal data analysis. However, existing trajectory \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nemoto:2025:ODG, author = "Jun Nemoto and Takashi Kambayashi and Takashi Hoshino and Hideyuki Kawashima", title = "{Oze}: Decentralized Graph-Based Concurrency Control for Long-Running Update Transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2321--2333", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742730", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper proposes Oze, a concurrency control protocol that handles heterogeneous workloads, including long-running update transactions. Oze explores a large scheduling space using \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Milkai:2025:HSR, author = "Elena Milkai and Xiangyao Yu and Jignesh M. Patel", title = "{Hermes}: Off-the-Shelf Real-Time Transactional Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2334--2347", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742731", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many modern applications require real-time analytics, where analytical processing (AP) workloads needs access to the latest data updates from a transactional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:AFT, author = "Zeying Zhu and Jonathan Chamberlain and Kenny Wu and David Starobinski and Zaoxing Liu", title = "Approximation-First Timeseries Query At Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2348--2361", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742732", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Timeseries monitoring systems such as Prometheus play a crucial role in gaining observability of the underlying system infrastructure. These systems collect \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:LFS, author = "Ziheng Wang and Junyu Wei and Alex Aiken and Guangyan Zhang and Jacob O. T{\o}rring and Rain Jiang and Chenyu Jiang and Wei Xu", title = "{LogCloud}: Fast Search of Compressed Logs on Object Storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2362--2370", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742733", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large organizations emit terabytes of logs every day in their cloud environment. Efficient data science on these logs via text search is crucial for gleaning operational insights and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:WSP, author = "Changlun Li and Chenyu Yang and Yuyu Luo and Ju Fan and Nan Tang", title = "Weak-to-Strong Prompts with Lightweight-to-Powerful {LLMs} for High-Accuracy, Low-Cost, and Explainable Data Transformation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2371--2384", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742734", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data transformation poses significant challenges due to the wide diversity in input data formats and different requirements. Existing approaches---including human-driven, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xie:2025:CAT, author = "Zhe Xie and Zeyan Li and Xiao He and Longlong Xu and Xidao Wen and Tieying Zhang and Jianjun Chen and Rui Shi and Dan Pei", title = "{ChatTS}: Aligning Time Series with {LLMs} via Synthetic Data for Enhanced Understanding and Reasoning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2385--2398", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742735", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Understanding time series is crucial for its application in real-world scenarios. Recently, large language models (LLMs) have been increasingly applied to time series \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cormode:2025:FDD, author = "Graham Cormode and Daniel Ting", title = "Federated Data Distribution Shift Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2399--2412", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742736", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data is increasingly held at the edge of the network, new methods are needed to perform analysis over distributed inputs. This has led to the emergence of the federated model \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bekkers:2025:IOA, author = "Liese Bekkers and Frank Neven and Stijn Vansummeren and Yisu Remy Wang", title = "Instance-Optimal Acyclic Join Processing Without Regret: Engineering the {Yannakakis} Algorithm in Column Stores", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2413--2426", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742737", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Acyclic join queries can be evaluated instance-optimally using Yannakakis' algorithm, which avoids needlessly large intermediate results through semi-join passes. Recent work \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Thiessen:2025:ALL, author = "Myles Thiessen and Guy Khazma and Sam Toueg and Eyal de Lara", title = "Asymmetric Linearizable Local Reads", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2427--2439", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742738", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many linearizable local read algorithms have been proposed to minimize the read latency of strongly consistent distributed databases deployed in geo-distributed networks. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:QVP, author = "Jinyang Liu and Pu Jiao and Kai Zhao and Xin Liang and Sheng Di and Franck Cappello", title = "{QPET}: a Versatile and Portable Quantity-of-Interest-Preservation Framework for Error-Bounded Lossy Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2440--2453", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742739", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Error-bounded lossy compression has been widely adopted in many scientific domains because it can address the challenges in storing, transferring, and analyzing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:OUM, author = "Xinyi Zhu and Yongqi Zhang and Lei Chen", title = "{OpenMEL}: Unsupervised Multimodal Entity Linking Using Noise-Free Expanded Queries and Global Coherence", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2454--2467", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742740", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multimodal Entity Linking (MEL), which involves disambiguating a mention composed of multimodal inputs to a multimodal knowledge base (KB), has gained increasing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ziehn:2025:UIW, author = "Ariane Ziehn and Jan Szlang and Steffen Zeuch and Volker Markl", title = "Unraveling the Impact of Window Semantics: Optimizing Join Order for Efficient Stream Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2468--2481", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742741", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Window joins (WJs) are fundamental operators in stream processing systems (SPSs), enabling continuous, time-aware joins over unbounded data streams. Unlike time-agnostic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zecchini:2025:DSD, author = "Luca Zecchini and Vasilis Efthymiou and Felix Naumann and Giovanni Simonini", title = "Deduplicated Sampling On-Demand", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2482--2495", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742742", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data practitioners often sample their datasets to produce representative subsets for their downstream tasks. When entities in a dataset can be partitioned into multiple groups, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jendal:2025:LGS, author = "Theis E. Jendal and Matteo Lissandrini and Peter Dolog and Katja Hose", title = "The Limits of Graph Samplers for Training Inductive Recommender Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2496--2504", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742743", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Inductive Recommender Systems are capable of recommending for new users and with new items thus avoiding the need to retrain after new data reaches the system. However, these \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liang:2025:HTM, author = "Zhiying Liang and Vahab Jabrayilov and Abutalib Aghayev and Aleksey Charapko", title = "{HoliPaxos}: Towards More Predictable Performance in State Machine Replication", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2505--2518", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742744", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "State machine replication (SMR) algorithms ensure redundancy in critical systems and, as a result, underpin fault-tolerant distributed databases. Good SMR protocol \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2025:DAC, author = "Peizhi Wu and Rong Kang and Tieying Zhang and Jianjun Chen and Ryan Marcus and Zachary G. Ives", title = "Data-Agnostic Cardinality Learning from Imperfect Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2519--2532", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742745", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cardinality estimation (CardEst) is a critical aspect of query optimization. Traditionally, it leverages statistics built directly over the data. However, organizational policies (e.g., \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:BMI, author = "Song Wang and Chen Wang and Jianchun Wang and Shengguo Li and Rui Li and Zhiyong Peng", title = "{BLAEQ}: a Multigrid Index for Spatial Query on Geometry Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2533--2546", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742746", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The efficiency of spatial queries is pivotal for the analysis of geometry data in the fields such as computational simulation, point cloud processing and digital engineering. Utilizing the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cui:2025:STC, author = "Ziyu Cui and Wensheng Dou and Yu Gao and Rui Yang and Yingying Zheng and Jiansen Song and Yuan Feng and Jun Wei", title = "Simple Testing Can Expose Most Critical Transaction Bugs: Understanding and Detecting Write-Specific Serializability Violations in Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2547--2560", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742747", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database Management Systems (DBMSs) utilize transactions to guarantee data consistency and integrity. Incorrect implementations of transaction processing mechanisms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sarpe:2025:EAE, author = "Ilie Sarpe and Aristides Gionis", title = "Efficient and Adaptive Estimation of Local Triadic Coefficients", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2561--2574", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742748", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Characterizing graph properties is fundamental to the analysis and to our understanding of real-world networked systems. The local clustering coefficient, and the more-recent, local \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:VTG, author = "Wentao Zhang and Jingyuan Wang and Yifan Yang and Leong Hou U", title = "{VecCity}: a Taxonomy-Guided Library for Map Entity Representation Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2575--2588", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742749", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Electronic maps consist of diverse entities, such as points of interest (POIs), road segments, and land parcels, playing a vital role in applications like ITS and LBS. Map entity \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mathew:2025:EME, author = "Jerin George Mathew and Donatella Firmani and Divesh Srivastava", title = "Evaluating Methods for Efficient Entity Count Estimation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2589--2601", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742750", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The problem of estimating the size of a query result has a long history in data management. When the query performs entity resolution (aka record linkage or deduplication), the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cheng:2025:FTP, author = "Audrey Cheng and Aaron Kabcenell and Xiao Shi and Jolene Huey and Peter Bailis and Natacha Crooks and Ion Stoica", title = "Fair Transaction Processing for Multi-Tenant Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2602--2615", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742751", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-tenant transactional databases frequently observe contention on shared data, leading to a need for performance isolation. Databases typically provide performance isolation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2025:LMT, author = "Sheng Lin and Fangcheng Fu and Haoyang Li and Hao Ge and Xuanyu Wang and Jiawen Niu and Yaofeng Tu and Bin Cui", title = "{LobRA}: Multi-Tenant Fine-Tuning over Heterogeneous Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2616--2625", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742752", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the breakthrough of Transformer-based pre-trained models, the demand for fine-tuning (FT) to adapt the base pre-trained models to downstream applications continues to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kamali:2025:RPE, author = "Amin Kamali and Verena Kantere and Calisto Zuzarte and Vincent Corvinelli", title = "Robust Plan Evaluation Based on Approximate Probabilistic Machine Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2626--2638", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742753", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers in RDBMSs search for execution plans expected to be optimal for given queries. They use parameter estimates, often inaccurate, and make assumptions that may not \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fathollahzadeh:2025:CDC, author = "Saeed Fathollahzadeh and Essam Mansour and Matthias Boehm", title = "{CatDB}: Data-Catalog-Guided, {LLM}-Based Generation of Data-Centric {ML} Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2639--2652", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742754", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data-centric machine learning (ML) pipelines extend traditional ML pipelines---of feature transformations, hyper-parameter tuning, and model training---by additional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:CPV, author = "Hanwen Liu and Shashank Giridhara and Ibrahim Sabek", title = "Conformal Prediction for Verifiable Learned Query Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2653--2666", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742755", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimization is critical in relational databases. Recently, numerous Learned Query Optimizers (LQOs) have been proposed, demonstrating superior performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Makhija:2025:ILP, author = "Neha Makhija and Wolfgang Gatterbauer", title = "Is Integer Linear Programming All You Need for Deletion Propagation?
{A} Unified and Practical Approach for Generalized Deletion Propagation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2667--2680", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742756", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Deletion Propagation (DP) refers to a family of database problems rooted in the classical view-update problem: how to propagate intended deletions in a view (query output) back \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:MCS, author = "Yurong Liu and Eduardo H. M. Pena and A{\'e}cio Santos and Eden Wu and Juliana Freire", title = "{Magneto}: Combining Small and Large Language Models for Schema Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2681--2694", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742757", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent advances in language models (LMs) open new opportunities for schema matching (SM). Recent approaches have shown their potential and key limitations: while small LMs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Guo:2025:EAS, author = "Qiuyu Guo and Jianye Yang and Wenjie Zhang and Hanchen Wang and Ying Zhang and Xuemin Lin", title = "Efficient and Accurate Subgraph Counting: a Bottom-up Flow-learning Based Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2695--2708", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742758", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph counting is a fundamental problem in graph analytics with broad applications, yet remains computationally intractable due to its \#P-hardness. To address this, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:ARQ, author = "Lixiang Chen and Yuxing Han and Yu Chen and Xing Chen and Chengcheng Yang and Weining Qian", title = "{AQETuner}: Reliable Query-Level Configuration Tuning for Analytical Query Engines", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2709--2721", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742759", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:01:20 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern analytical query engines (AQEs) are essential for large-scale data analysis and processing. These systems usually provide numerous query-level tunable knobs that significantly affect individual query performance. While several studies have \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:PNZ, author = "Jun Liu and Bingqian Du and Ziyue Luo and Sitian Lu and Qiankun Zhang and Hai Jin", title = "{PipeTGL}: (Near) Zero Bubble Memory-Based Temporal Graph Neural Network Training via Pipeline Optimization", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2722--2734", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742760", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:01:20 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Memory-based Temporal Graph Neural Networks (M-TGNNs) demonstrate superior performance in dynamic graph learning tasks. Their success attributes to a memory module, which captures historical information for each node and implicitly creates a memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chung:2025:LCA, author = "Yeounoh Chung and Gaurav T. Kakkar and Yu Gan and Brenton Milne and Fatma {\"O}zcan", title = "Is Long Context All You Need? Leveraging {LLM}'s Extended Context for {NL2SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2735--2747", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742761", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:01:20 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Large Language Models (LLMs) have demonstrated impressive capabilities across a range of natural language processing tasks. In particular, improvements in reasoning abilities and the expansion of context windows have opened new avenues for leveraging \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kuiper:2025:SPH, author = "Laurens Kuiper and Paul Gro{\ss} and Peter Boncz and Hannes M{\"u}hleisen", title = "Saving Private Hash Join", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "8", pages = "2748--2760", month = apr, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3742728.3742762", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:01:20 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern analytical database systems offer high-performance in-memory joins. However, if the build side of a join does not fit in RAM, performance degrades sharply due to switching to traditional external join algorithms such as sort-merge. In streaming \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2025:CCS, author = "Weixing Zhou and Yanfeng Zhang and Xinji Zhou and Zhiyou Wang and Zeshun Peng and Yang Ren and Sihao Li and Huanchen Zhang and Guoliang Li and Ge Yu", title = "Concurrency Control as a Service", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2761--2774", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746406", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing disaggregated databases separate execution and storage layers, enabling independent and elastic scaling of resources. In most cases, this design makes \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qiu:2025:TUB, author = "Xiangfei Qiu and Zhe Li and Wanghui Qiu and Shiyan Hu and Lekui Zhou and Xingjian Wu and Zhengyu Li and Chenjuan Guo and Aoying Zhou and Zhenli Sheng and Jilin Hu and Christian S. Jensen and Bin Yang", title = "{TAB}: Unified Benchmarking of Time Series Anomaly Detection Methods", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2775--2789", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746407", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series anomaly detection (TSAD) plays an important role in many domains such as finance, transportation, and healthcare. With the ongoing instrumentation of reality, more time \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhong:2025:HDT, author = "Yuchen Zhong and Junwei Su and Chuan Wu and Minjie Wang", title = "{Heta}: Distributed Training of Heterogeneous Graph Neural Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2790--2803", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746408", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Heterogeneous Graphs (HetGs) that capture relationships among different types of nodes are ubiquitous in real-world applications such as academic networks and e-commerce. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Foufoulas:2025:UBG, author = "Yannis Foufoulas and Theoni Palaiologou and Alkis Simitsis", title = "The {UDFBENCH} Benchmark for General-Purpose {UDF} Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2804--2817", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746409", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "User-defined functions (UDFs) extend the expressiveness of declarative SQL with functional capabilities, but also pose a core bottleneck in query processing due to the impedance \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2025:EEK, author = "Ye Sun and Lei Shi and Yongxin Tong", title = "{eXpath}: Explaining Knowledge Graph Link Prediction with Ontological Closed Path Rules", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2818--2830", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746410", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Link prediction (LP) is crucial for Knowledge Graphs (KG) completion but commonly suffers from interpretability issues. While several methods have been proposed to explain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Katsarakis:2025:LTL, author = "Antonios Katsarakis and Vasilis Gavrielatos and Emmanouil Giortamis and Pramod Bhatotia and Aleksandar Dragojevic and Boris Grot and Vijay Nagarajan and Panagiota Fatourou", title = "The {LAW} Theorem: Local Reads and Linearizable Asynchronous Replication", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2831--2845", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746411", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed datastores underpin highly concurrent, read-intensive applications, ensuring consistency, availability, and performance. They use crash-tolerant protocols to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Vandevoort:2025:URP, author = "Brecht Vandevoort and Alan Fekete and Bas Ketsman and Frank Neven and Stijn Vansummeren", title = "Using Read Promotion and Mixed Isolation Levels for Performant Yet Serializable Execution of Transaction Programs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2846--2858", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746412", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We propose a theory that can determine the lowest isolation level that can be allocated to each transaction program in an application in a mixed-isolation-level setting, to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Korkmaz:2025:LAC, author = "Zeynep Korkmaz and M. Tamer {\"O}zsu and Khuzaima Daudjee", title = "Locality-Aware Cache Replacement Policy for Graph Traversals", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2859--2871", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746413", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many graph processing applications consist of read-only workloads that need to perform low-latency traversals over large graphs. These traversals are inherently expensive, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Adao:2025:KCD, author = "R{\'u}ben Ad{\~a}o and Zhongjie Wu and Changjun Zhou and Oana Balmau and Jo{\~a}o Paulo and Ricardo Macedo", title = "{KEIGO}: Co-Designing Log-Structured Merge Key-Value Stores with a Non-Volatile, Concurrency-Aware Storage Hierarchy", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2872--2885", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746414", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Keigo, a concurrency- and workload-aware storage middleware that enhances the performance of log-structured merge key-value stores (LSM KVS) when they are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:WLI, author = "Qiyu Liu and Siyuan Han and Yanlin Qi and Jingshu Peng and Jin Li and Longlong Lin and Lei Chen", title = "Why Are Learned Indexes So Effective but Sometimes Ineffective?", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2886--2898", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746415", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Learned indexes have attracted significant research interest due to their potential to offer better space-time trade-offs compared to B+-tree variants. Among various learned \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khan:2025:SMS, author = "Falaah Arif Khan and Denys Herasymuk and Nazar Protsiv and Julia Stoyanovich", title = "Still More Shades of Null: an Evaluation Suite for Responsible Missing Value Imputation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2899--2913", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746416", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data missingness is a practical challenge of sustained interest to the scientific community. In this paper, we present Shades-of-Null, an evaluation suite for responsible \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cong:2025:OPM, author = "Tianji Cong and Fatemeh Nargesian and Junjie Xing and H. V. 
Jagadish", title = "{OpenForge}: Probabilistic Metadata Integration", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2914--2927", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746417", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data stores increasingly rely on metadata to enable diverse activities such as data cataloging and search. However, metadata curation remains a labor-intensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmad:2025:MPF, author = "Akhlaque Ahmad and Da Yan and Xiao Chen and Lyuheng Yuan and Qin Zhang and Saugat Adhikari", title = "Maximum $k$-Plex Finding: Choices of Pruning Techniques Matter!", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2928--2940", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746418", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "A $k$-plex is a dense subgraph structure where every vertex can be disconnected with at most $k$ vertices. Finding a maximum $k$-plex (M$k$P) in a big graph is a key primitive in many \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gou:2025:CSE, author = "Xiaoxuan Gou and Weiguo Zheng and Yuxiang Wang and Xiaoliang Xu and Zhiyuan Yu", title = "A Comprehensive Survey and Experimental Study of Learning-Based Community Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2941--2954", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746419", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Given a graph G and a query node q, the goal of community search (CS) is to find a structurally cohesive subgraph from G that contains q. Significant progress has been made in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2025:DAS, author = "Weizheng Lu and Chao Hui and Yunhai Wang and Feng Zhang and Yueguo Chen and Bao Liu and Chengjie Li and Zhaoxin Wu and Xuye Qin", title = "Decentralized Actor Scheduling and Reference-Based Storage in Xorbits: a Native Scalable Data Science Engine", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2955--2963", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746420", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data science pipelines consist of data preprocessing and transformation, and a typical pipeline comprises a series of operators, such as DataFrame filtering and groupby. As \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kong:2025:SES, author = "Tao Kong and Hui Li and Yuxuan Zhao and Liping Li and Xiyue Gao and Qilong Wu and Jiangtao Cui", title = "{STsCache}: an Efficient Semantic Caching Scheme for Time-Series Data Workloads Based on Hybrid Storage", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2964--2977", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746421", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Due to the increasing demand for extreme-scale time-series data workloads in data centers, it is required to build a high-performance semantic caching system that leverages the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:CCD, author = "Ruihong Wang and Jianguo Wang and Walid G. Aref", title = "Cache Coherence Over Disaggregated Memory", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2978--2991", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746422", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Disaggregating memory from compute offers the opportunity to better utilize stranded memory in cloud data centers. It is important to cache data in the compute nodes and maintain \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bhoot:2025:TSS, author = "Ruchi Bhoot and Tuhin Khare and Manoj Agarwal and Siddharth Jaiswal and Yogesh Simmhan", title = "{Triparts}: Scalable Streaming Graph Partitioning to Enhance Community Structure", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "2992--3006", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746423", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "k-way edge based partitioning algorithms for processing large streaming graphs, such as social networks and web crawls, assign each arriving edge to one of the k partitions. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qi:2025:LFB, author = "Shipeng Qi and Bing Tong and Jiatao Hu and Heng Lin and Yue Pang and Wei Yuan and Songlin Lyu and Zhihui Guo and Ke Huang and Xujin Ba and Qiang Yin and Youren Shen and Yan Zhou and Tao Lv and Jia Li and Lei Zou and Yongwei Wu and G{\'a}bor Sz{\'a}rnyas and Xiaowei Zhu and Wenguang Chen and Chuntao Hong", title = "The {LDBC} Financial Benchmark: Transaction Workload", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3007--3020", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746424", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph databases play a pivotal role in the FinTech industry. However, existing graph benchmarks fail to capture the unique characteristics of financial datasets and workloads, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sohn:2025:AQO, author = "Donghyun Sohn and Kelly Jiang and Nicolas Hammer and Jennie Rogers", title = "{Alchemy}: a Query Optimization Framework for Oblivious {SQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3021--3034", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746425", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data sharing opportunities are everywhere, but privacy concerns and regulatory constraints often prevent organizations from fully realizing their value. A private data federation \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shankar:2025:DAQ, author = "Shreya Shankar and Tristan Chambers and Tarak Shah and Aditya G. Parameswaran and Eugene Wu", title = "{DocETL}: Agentic Query Rewriting and Evaluation for Complex Document Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3035--3048", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746426", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Analyzing unstructured data has been a persistent challenge in data processing. Recent proposals offer declarative frameworks for LLM-powered processing of unstructured \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2025:HSV, author = "Guoyu Hu and Shaofeng Cai and Tien Tuan Anh Dinh and Zhongle Xie and Cong Yue and Gang Chen and Beng Chin Ooi", title = "{HAKES}: Scalable Vector Database for Embedding Search Service", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3049--3062", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746427", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern deep learning models capture the semantics of complex data by transforming them into high-dimensional embedding vectors. Emerging applications, such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:PCC, author = "Zhengdong Wang and Qiang Yin and Longbin Lai", title = "Path-Centric Cardinality Estimation for Subgraph Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3063--3076", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746428", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents PathCE, a path-centric cardinality estimation framework for subgraph matching. PathCE improves estimation accuracy by utilizing statistics from short graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lin:2025:CSS, author = "Hong Lin and Shixin Wan and Zhongle Xie and Ke Chen and Meihui Zhang and Lidan Shou and Gang Chen", title = "A Comprehensive Study of {Shapley} Value in Data Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3077--3092", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746429", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the recent years, Shapley value (SV), a solution concept from cooperative game theory, has found numerous applications in data analytics (DA). This paper presents the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:EED, author = "Longjiao Zhang and Rui Wang and Tongya Zheng and Ziqi Huang and Wenjie Huang and Xinyu Wang and Can Wang and Mingli Song and Sai Wu and Shuibing He", title = "Effective and Efficient Distributed Temporal Graph Learning through Hotspot Memory Sharing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3093--3105", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746430", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Memory-based temporal graph neural network (MTGNN) models are effective for predicting temporal graphs by using node memory and message-passing modules to capture \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Haque:2025:SSS, author = "Riddho R. 
Haque and Anh L. Mai and Matteo Brucato and Azza Abouzied and Peter J. Haas and Alexandra Meliou", title = "Stochastic {SketchRefine}: Scaling In-Database Decision-Making under Uncertainty to Millions of Tuples", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3106--3118", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746431", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Decision making under uncertainty often requires choosing packages, or bags of tuples, that collectively optimize expected outcomes while limiting risks. Processing Stochastic \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Weisgut:2025:CMP, author = "Marcel Weisgut and Daniel Ritter and Pinar T{\"o}z{\"u}n and Lawrence Benson and Tilmann Rabl", title = "{CXL} Memory Performance for In-Memory Data Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3119--3133", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746432", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Compute Express Link (CXL) standard enables new forms of memory management and access across devices and servers. Based on PCIe, it enables cache-coherent access to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Teng:2025:LAL, author = "Fei Teng and Haoyang Li and Lei Chen", title = "{LLMLog}: Advanced Log Template Generation via {LLM-Driven} Multi-Round Annotation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3134--3148", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746433", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern computing systems, such as HDFS and Spark, produce vast quantities of logs that developers use for tasks like anomaly detection and error analysis. To simplify log \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ngo:2025:CHK, author = "Vinh Quang Ngo and Marina Papatriantafilou", title = "Cuckoo Heavy Keeper and the Balancing Act of Maintaining Heavy Hitters in Stream Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3149--3161", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746434", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Finding heavy hitters in databases and data streams is a fundamental problem with applications ranging from network monitoring to database query optimization, machine \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:RRC, author = "Qian Zhang and Yiwen Xiang and Jianhao Wei and Yang Yang and Yifan Li and Xueqing Gong and Wanggen Liu", title = "{Rebirth-Retire}: a Concurrency Control Protocol Adaptable to Different Levels of Contention", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3162--3174", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746435", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:09 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Wound-Retire concurrency control protocol was proposed to reduce contention for hotspots in in-memory databases. It enhances throughput under high-contention scenarios by \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:UMI, author = "Ruikun Li and Dai Shi and Ye Xiao and Junbin Gao", title = "{UFGTime}: Mining Intertwined Dependencies in Multivariate Time Series via an Efficient Pure Graph Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3175--3188", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746436", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:04:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Neural Networks (GNNs) have become a cornerstone in multivariate time series forecasting by addressing the challenge of modeling inter-series dependencies often overlooked by traditional temporal approaches. However, real-world temporal \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2025:AOH, author = "Ruochen Jiang and Spyros Blanas", title = "{ArrayMorph}: Optimizing Hyperslab Queries on the Cloud for Machine Learning Pipelines", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3189--3202", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746437", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:04:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud storage services such as Amazon S3, Azure Blob Storage, and Google Cloud Storage are widely used to store raw data for machine learning applications. When the data is later processed, the analysis predominantly focuses on regions of interest (such \ldots{})", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:IFG, author = "Yangxin Fan and Haolai Che and Yinghui Wu", title = "Inference-Friendly Graph Compression for Graph Neural Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3203--3215", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746438", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:04:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Neural Networks (GNNs) have demonstrated promising performance in graph analysis. Nevertheless, the inference process of GNNs remains costly, hindering their applications for large graphs. This paper proposes inference-friendly graph compression \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Talluri:2025:GHP, author = "Sacheendra Talluri and Guido Walter {Di Donato} and Luca Danelutti and Koen R. Vlaswinkel and Marco Arnaboldi and Arnaud Delamare and Marco D. Santambrogio and Daniele Bonetta", title = "{GpJSON}: High-Performance {JSON} Data Processing on {GPUs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3216--3229", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746439", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:04:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The JavaScript Object Notation (JSON) format is ubiquitous, and countless applications depend on it to store and exchange high volumes of data. Despite its great popularity, JSON is nevertheless a very inefficient data format: decoding and querying JSON \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ferrara:2025:BSP, author = "Antonio Ferrara and David Garc{\'\i}a-Soriano and Francesco Bonchi", title = "Beyond Shortest Paths: Node Fairness in Route Recommendation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "9", pages = "3230--3242", month = may, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3746405.3746440", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:04:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Traditionally, route recommendation systems focused on minimizing distance (or time) to travel between two points. However, recent attention has shifted to other factors beyond mere length. This paper addresses the challenge of ensuring a fair \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:ACI, author = "Yin Li and Sharad Mehrotra and Shantanu Sharma and Komal Kumari", title = "Access Control for Information-Theoretically Secure Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3243--3255", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748192", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper presents a novel key-based access control technique for secure outsourcing key-value stores where values correspond to documents that are indexed and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Peng:2025:DRF, author = "Zhencan Peng and Miao Qiao and Wenchao Zhou and Feifei Li and Dong Deng", title = "Dynamic Range-Filtering Approximate Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3256--3268", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748193", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Range-filtering approximate nearest neighbor search (RFANNS) has gained significant attention recently. Consider a set D of high-dimensional vectors, each associated with a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2025:LGM, author = "Yukun Cao and Zengyi Gao and Zhiyang Li and Xike Xie and S. 
Kevin Zhou and Jianliang Xu", title = "{LEGO-GraphRAG}: Modularizing Graph-Based Retrieval-Augmented Generation for Design Space Exploration", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3269--3283", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748194", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "GraphRAG integrates (knowledge) graphs with large language models (LLMs) to improve reasoning accuracy and contextual relevance. Despite its promising applications and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hwang:2025:DVE, author = "Jinwoo Hwang and Daeun Kim and Sangyeop Lee and Yoonsung Kim and Guseul Heo and Hojoon Kim and Yunseok Jeong and Tadiwos Meaza and Eunhyeok Park and Jeongseob Ahn and Jongse Park", title = "{D{\'e}j{\`a} Vu}: Efficient Video-Language Query Engine with Learning-Based Inter-Frame Computation Reuse", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3284--3298", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748195", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recently, Video-Language Models (VideoLMs) have demonstrated remarkable capabilities, offering significant potential for flexible and powerful video query systems. These \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Stoian:2025:PSP, author = "Mihail Stoian and Andreas Zimmerer and Skander Krid and Amadou Latyr Ngom and Jialin Ding and Tim Kraska and Andreas Kipf", title = "{Parachute}: Single-Pass Bi-Directional Information Passing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3299--3311", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748196", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sideways information passing is a well-known technique for mitigating the impact of large build sides in a database query plan. As currently implemented in production systems, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:TTC, author = "Jiani Yang and Sai Wu and Yong Wang and Dongxiang Zhang and Yifei Liu and Xiu Tang and Gang Chen", title = "Twisted Twin: a Collaborative and Competitive Memory Management Approach in {HTAP} Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3312--3325", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748197", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Many GaussDB customers, particularly small and medium-sized enterprises (SMEs), require high transaction throughput with occasional analytical queries. HTAP systems that deploy \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Farhan:2025:CMH, author = "Muhammad Farhan and Henning Koehler and Qing Wang and Jiawen Wang and Moritz Laupichler and Peter Sanders", title = "Customization Meets 2-Hop Labeling: Efficient Routing in Road Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3326--3338", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748198", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient route planning is crucial for modern navigation systems, yet traditional methods face challenges in scenarios with unknown or frequently changing traffic dynamics. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:XBM, author = "Dayi Fan and Rubao Lee and Xiaodong Zhang", title = "{X-Blossom}: Massive Parallelization of Graph Maximum Matching", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3339--3353", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748199", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The blossom algorithm computes maximum matchings in graphs and has been widely applied across diverse domains, including machine learning, economic analysis, and other \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:DIL, author = "Chenyu Yang and Yuyu Luo and Chuanxuan Cui and Ju Fan and Chengliang Chai and Nan Tang", title = "Data Imputation with Limited Data Redundancy Using Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3354--3367", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748200", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data imputation is essential for many data science applications. Existing methods rely heavily on sufficient data redundancy from within-table values. However, many \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2025:CMO, author = "Chunyue Huang and Shuang Liu and Xinyi Zhang and Wenhao Li and Wei Lu and Xiaoyong Du", title = "{Chimera}: Mitigating Ownership Transfers in Multi-Primary Shared-Storage Cloud-Native Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3368--3381", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748201", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud-native database systems with multi-primary shared-storage architecture have emerged due to their superior performance over primary-secondary architecture on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xu:2025:STA, author = "Minze Xu and Zhentai Xie and Zhibin Wang and Guangzhan Wang and Longbin Lai and Yuan Zhang and Chen Tian and Sheng Zhong", title = "{Sectric}: Towards Accurate, Privacy-Preserving and Efficient Triangle Counting", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3382--3395", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748202", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph data analysis, particularly local triangle counting, plays a pivotal role in deciphering complex relationships within graph data. This method is invaluable across diverse \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:WSM, author = "Haoyang Li and Yuming Xu and Yiming Li and Hanmo Liu and Darian Li and Chen Jason Zhang and Lei Chen and Qing Li", title = "When Speed Meets Accuracy: an Efficient and Effective Graph Model for Temporal Link Prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3396--3405", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748203", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Temporal link prediction in dynamic graphs is a critical task with applications in diverse domains such as social networks, recommendation systems, and e-commerce platforms. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2025:ITS, author = "Yuxin Tang and Feng Zhang and Jiawei Guan and Yuan Tian and Xiangdong Huang and Chen Wang and Jianmin Wang and Xiaoyong Du", title = "Improving Time Series Data Compression in {Apache IoTDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3406--3420", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748204", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series data are generated on an unprecedented scale across various domains. Although traditional compression techniques reduce storage costs, they typically \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:LEM, author = "Jianwei Wang and Kai Wang and Ying Zhang and Wenjie Zhang and Xiwei Xu and Xuemin Lin", title = "On {LLM}-Enhanced Mixed-Type Data Imputation with High-Order Message Passing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3421--3434", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748205", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Missing data imputation, which aims to impute the missing values in the raw datasets, is crucial for modern data-driven models like large language models (LLMs). Despite its \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chakraborty:2025:MDE, author = "Vishal Chakraborty and Youri Kaminsky and Sharad Mehrotra and Felix Naumann and Faisal Nawab and Primal Pappachan and Mohammad Sadoghi and Nalini Venkatasubramanian", title = "Meaningful Data Erasure in the Presence of Dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3435--3448", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748206", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data regulations like GDPR require systems to support data erasure but leave the definition of ``erasure'' open to interpretation. This ambiguity makes compliance challenging, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2025:SMD, author = "Chuzhe Tang and Zhaoguo Wang and Jinyang Li and Haibo Chen", title = "{Sonata}: Multi-Database Transactions Made Fast and Serializable", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3449--3462", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748207", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Today, the wide adoption of distributed service-oriented applications has rendered multi-database transactions increasingly important. They protect cross-service workflows \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ceccarello:2025:MSM, author = "Matteo Ceccarello and Francesco Pio Monaco and Francesco Silvestri", title = "{MOMENTI}: Scalable Motif Mining in Multidimensional Time Series", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3463--3476", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748208", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Time series play a fundamental role in many domains, capturing a plethora of information about the underlying data-generating processes. When a process generates multiple synchronized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Martin:2025:HWF, author = "Albert Martin and Eduardo C. de Almeida and Oscar Romero and Anna Queralt", title = "How and Why False Denial Constraints are Discovered", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3477--3489", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748209", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Denial Constraints (DCs) are a flexible formalism to express many types of data rules, making them a widely adopted tool for many applications. This flexibility led to the development \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2025:ECD, author = "Yingli Zhou and Qingshuo Guo and Yixiang Fang", title = "Efficient $k$-Clique Densest Subgraph Discovery: Towards Bridging Practice and Theory", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3490--3503", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748210", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Densest subgraph discovery (DSD) is a fundamental topic in graph mining. It has been studied for decades, and is widely used in various areas, including network science, biological \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:ANL, author = "Meihao Fan and Ju Fan and Nan Tang and Lei Cao and Guoliang Li and Xiaoyong Du", title = "{AutoPrep}: Natural Language Question-Aware Data Preparation with a Multi-Agent Framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3504--3517", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748211", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Answering natural language (NL) questions about tables, known as Tabular Question Answering (TQA), is crucial because it allows users to quickly and efficiently extract \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gong:2025:AAN, author = "Zengyang Gong and Yuxiang Zeng and Lei Chen", title = "Accelerating Approximate Nearest Neighbor Search in Hierarchical Graphs: Efficient Level Navigation with Shortcuts", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3518--3530", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748212", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate Nearest Neighbor (ANN) search is a foundational yet computationally demanding query in vector databases, critical for applications such as information retrieval and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:FIT, author = "Yan Zhang and Shuwei Liang and Xiaoye Miao and Yangyang Wu and Jianwei Yin", title = "Federated Incomplete Tabular Data Prediction with Missing Complementarity", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3531--3544", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748213", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tabular data is abundant and crucial across both industry and academia. Federated learning (FL) offers a promising solution for the analysis of tabular data distributed across multiple \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hu:2025:LTS, author = "Yihao Hu and Jin Wang and Sajjadur Rahman", title = "{LakeVisage}: Towards Scalable, Flexible and Interactive Visualization Recommendation for Data Discovery over Data Lakes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3545--3558", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748214", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data discovery from data lakes is an essential application in modern data science. While many previous studies focused on improving the efficiency and effectiveness of data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2025:PMA, author = "Xiaokai Zhou and Xiao Yan and Fangcheng Fu and Ziwen Fu and Tieyun Qian and Yuanyuan Zhu and Qinbo Zhang and Bin Cui and Jiawei Jiang", title = "{PS-MI}: Accurate, Efficient, and Private Data Valuation in Vertical Federated Learning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3559--3572", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748215", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vertical federated learning (VFL) trains models when multiple databases (a.k.a.\ participants) hold different features of the same set of samples. By quantifying each participant's \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:TPA, author = "Jiasheng Zhang and Deqiang Ouyang and Shuang Liang and Jie Shao", title = "Towards Pattern-Aware Data Augmentation for Temporal Knowledge Graph Completion", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3573--3586", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748216", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Predicting missing facts for temporal knowledge graphs (TKGs) is a fundamental task, called temporal knowledge graph completion (TKGC). One key challenge in this task is the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hao:2025:RED, author = "Chiyu Hao and Jixian Su and Shixuan Sun and Hao Zhang and Sen Gao and Jianwen Zhao and Chenyi Zhang and Jieru Zhao and Chen Chen and Minyi Guo", title = "{RapidStore}: an Efficient Dynamic Graph Storage System for Concurrent Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3587--3600", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748217", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Dynamic graph storage systems are essential for real-time applications such as social networks and recommendation, where the graph continuously evolves. However, they \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:GGO, author = "Xiaoyu Fan and Kun Chen and Jiping Yu and Xiaowei Zhu and Yunyi Chen and Huanchen Zhang and Wei Xu", title = "{GORAM}: Graph-Oriented {ORAM} for Efficient Ego-Centric Queries on Federated Graphs", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3601--3614", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748218", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ego-centric queries, focusing on a target vertex and its direct neighbors, are essential for various applications. Enabling such queries on graphs owned by mutually distrustful data \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2025:AAQ, author = "Weijie Sun and Zihuan Xu and Wangze Ni and Lei Chen and Peng Cheng and Chen Jason Zhang", title = "Authenticated Aggregate Queries with {Boolean} Range Predicates on Blockchains", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3615--3627", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748219", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Blockchains have gained wide adoption for secure data processing. As blockchain data volumes grow, the demand for efficient data analysis, especially aggregate queries, becomes \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chao:2025:FFI, author = "Zemin Chao and Qiaoyi Zheng and Zhixin Qi and Hongzhi Wang", title = "{FSMDTW}: a Fast Index-Free Subsequence Matching Algorithm for Dynamic Time Warping", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3628--3640", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748220", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The subsequence matching problem utilizing dynamic time warping as the similarity measurement has been recognized as a key operation in time series analysis for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2025:FGW, author = "Jianheng Tang and Xi Zhao and Lemin Kong and Xiaofang Zhou and Jia Li", title = "Fused {Gromov-Wasserstein} Alignment for Graph Edit Distance Computation and Beyond", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3641--3654", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748221", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:11 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Edit Distance (GED) is a widely recognized metric for measuring graph similarity, yet its NP-complete nature poses challenges for fast and accurate computation. This paper \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:ETT, author = "Tianshu Zhang and Kun Qian and Siddhartha Sahai and Yuan Tian and Shaddy Garg and Huan Sun and Yunyao Li", title = "{Evoschema}: Towards Text-to-{SQL} Robustness against Schema Evolution", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3655--3668", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748222", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:07:36 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Neural text-to-SQL models, which translate natural language questions (NLQs) into SQL queries given a database schema, have achieved remarkable performance. However, database schemas frequently evolve to meet new requirements. Such schema evolution often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Han:2025:EEC, author = "Shuai Han and Yushi Tao and Jingwen Tan and Huanran Wang and Wu Yang and Yanmei Wang", title = "Effective and Efficient Community Search for Complex Network Semantics Capture: From Coarse-Grain to Fine-Grain", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3669--3681", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748223", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:07:36 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To analyze the massive social networks for providing personalized services, community search is widely studied to find the densely connected subgraph that can reflect the network properties for a given query. The existing community search methods adopt \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:HWD, author = "Rongrong Zhang and Zhiwei Ye and Jun-Peng Zhu and Peng Cai and Xuan Zhou and Dunbo Cai and Ling Qian", title = "{HAWK}: a Workload-Driven Hierarchical Deadlock Detection Approach in Distributed Database System", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "10", pages = "3682--3694", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3748191.3748224", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 08:07:36 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Distributed databases are widely used in various fields, such as financial services and e-commerce. These businesses generally exhibit characteristics of large-scale and rapid growth. However, these business systems often suffer from deadlocks that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chai:2025:DBA, author = "Chengliang Chai and Jiajun Li and Yuhao Deng and Yuanhao Zhong and Ye Yuan and Guoren Wang and Lei Cao", title = "{Doctopus}: Budget-Aware Structural Table Extraction from Unstructured Documents", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3695--3707", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749647", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "To fulfill the potential great value of unstructured documents, it is critical to extract structural data (e.g., attributes) from them, which can benefit various applications \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wen:2025:ESS, author = "Qi Wen and Yutong Ye and Xiang Lian and Mingsong Chen", title = "{S$^3$AND}: Efficient Subgraph Similarity Search under Aggregated Neighbor Difference Semantics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3708--3720", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749648", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "For the past decades, the subgraph similarity search over a large-scale data graph has become increasingly important and crucial in many real-world applications, such as social \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zheng:2025:LXE, author = "Yanping Zheng and Zhewei Wei and Frank de Hoog and Xu Chen and Hongteng Xu and Yuhang Ye and Jiadeng Huang", title = "{Lighter-X}: an Efficient and Plug-and-Play Strategy for Graph-Based Recommendation through Decoupled Propagation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3721--3729", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749649", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graph Neural Networks (GNNs) have demonstrated remarkable effectiveness in recommendation systems. However, conventional graph-based recommenders, such as \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:SES, author = "Qiyu Liu and Yanlin Qi and Siyuan Han and Jingshu Peng and Jin Li and Lei Chen", title = "Not Small Enough? {SegPQ}: a Learned Approach to Compress Product Quantization Codebooks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3730--3743", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749650", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rapid advancements of generative artificial intelligence (GenAI) have recently led to renewed attention towards approximate nearest neighbor (ANN) search and vector databases \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Rashedi:2025:ACE, author = "Nazanin Rashedi and Guido Moerkotte", title = "The Accuracy of Cardinality Estimators: Unraveling the Evaluation Result Conundrum", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3744--3756", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749651", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Existing research on the accuracy of cardinality estimators generally suffers from a lack of diversity and sufficient quantity of their experimental datasets, particularly in \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tang:2025:LLP, author = "Benzhao Tang and Shiyu Yang and Zhitao Shen and Wenjie Zhang and Xuemin Lin and Zhihong Tian", title = "{LogLite}: Lightweight Plug-and-Play Streaming Log Compression", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3757--3770", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749652", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Log data is a vital resource for capturing system events and states. With the increasing complexity and widespread adoption of modern software systems and IoT devices, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tsegai:2025:EEA, author = "Saimon Amanuel Tsegai and Xinyu Yang and Haoyuan Liu and Peng Gao", title = "Enabling Efficient Attack Investigation via Human-in-the-Loop Security Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3771--3783", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749653", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "System auditing is a vital technique for collecting system call events as system provenance and investigating complex multi-step attacks such as Advanced Persistent Threats. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fu:2025:STI, author = "Wenzhi Fu and Yang Cao", title = "Shifting Transaction Isolation on Graphs: From Systems to Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3784--3796", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749654", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Processing long-running read-write transactions on graphs is an open challenge, primarily due to the need for serializability to maintain basic structural consistency of graphs. We \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2025:FGV, author = "Wenqi Jiang and Hang Hu and Torsten Hoefler and Gustavo Alonso", title = "Fast Graph Vector Search via Hardware Acceleration and Delayed-Synchronization Traversal", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3797--3811", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749655", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector search systems are indispensable in large language model (LLM) serving, search engines, and recommender systems, where minimizing online search latency is essential. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ye:2025:FLE, author = "Hengyu Ye and Jiadong Chen and Fuxin Jiang and Xiao He and Tieying Zhang and Jianjun Chen and Xiaofeng Gao", title = "{Fremer}: Lightweight and Effective Frequency Transformer for Workload Forecasting in Cloud Services", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3812--3825", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749656", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Workload forecasting is pivotal in cloud service applications, such as auto-scaling and scheduling, with profound implications for operational efficiency. Although \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Nobari:2025:TLL, author = "Arash Dargahi Nobari and Davood Rafiei", title = "{TabulaX}: Leveraging Large Language Models for Multi-Class Table Transformations", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3826--3839", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749657", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The integration of tabular data from diverse sources is often hindered by inconsistencies in formatting and representation, posing significant challenges for data analysts and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cui:2025:BLT, author = "Fan Cui and Eric Lo and Srijan Srivastava and Ziliang Lai", title = "{Bonspiel}: Low Tail Latency Transactions in Geo-Distributed Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3840--3853", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749658", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tail latency is crucial as it impacts user satisfaction and service-level objectives (SLOs). However, geo-distributed databases have long struggled with this issue due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:EGD, author = "Qiange Wang and Yongze Yan and Hongshi Tan and Cheng Chen and Cheng Zhao and Jiaming Tian and Jiaxin Jiang and Xiaoliang Cong and Yanfeng Zhang and Ge Yu and Weng-Fai Wong and Bingsheng He", title = "Efficient Graph Data Access for Out-of-Memory {GPU} Streaming Graph Processing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3854--3867", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749659", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Leveraging GPUs' high parallelism can significantly improve the real-time computation efficiency of streaming graph processing. However, when a large-scale graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schmitt:2025:ERE, author = "Daniel Schmitt and Thomas H{\"u}tter and Nikolaus Augsten", title = "Extensible and Robust Evaluation of Similarity Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3868--3882", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749660", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We study the similarity join problem from a systems perspective. A similarity join retrieves all similar record pairs from two collections based on a given distance function. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2025:PWS, author = "Yan Zhou and Chunwei Liu and Bhuvan Urgaonkar and Zhengle Wang and Magnus Mueller and Chao Zhang and Songyue Zhang and Pascal Pfeil and Dominik Horn and Zhengchun Liu and Davide Pagano and Tim Kraska and Samuel Madden and Ju Fan", title = "{PBench}: Workload Synthesizer with Real Statistics for Cloud Analytics Benchmarking", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3883--3895", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749661", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud service providers commonly use standard benchmarks like TPC-H and TPC-DS to evaluate and optimize cloud data analytics systems. However, these benchmarks rely on \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lu:2025:ASM, author = "Yujie Lu and Zhijie Zhang and Weiguo Zheng and Lei Zou", title = "Accelerating Subgraph Matching through Fine-Grained and Powerful Equivalences", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3896--3909", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749662", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Subgraph matching, a cornerstone of graph analytics, critically suffers from redundant computations during the search process. Existing methods primarily target identical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gretscher:2025:HOS, author = "Luca Gretscher and Jens Dittrich", title = "How to Optimize {SQL} Queries? {A} Comparison Between Split, Holistic, and Hybrid Approaches", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3910--3922", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749663", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational database systems internally construct a physical query execution plan (QEP) that specifies exactly how to compute a desired result. However, choosing a QEP involves \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Eslami:2025:DDR, author = "Navid Eslami and Ioana O. 
Bercea and Niv Dayan", title = "{Diva}: Dynamic Range Filter for Var-Length Keys and Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3923--3936", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749664", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Range filters are compact probabilistic data structures that answer approximate range emptiness queries. They are used in many domains, e.g., in key-value stores, to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Becchetti:2025:AHN, author = "Luca Becchetti and Andrea Clementi and Luciano Gual{\`a} and Luca Pep{\`e} Sciarria and Alessandro Straziota and Matteo Stromieri", title = "Approximate 2-hop Neighborhoods on Incremental Graphs: an Efficient Lazy Approach", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3937--3950", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749665", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this work, we propose, analyze and empirically validate a lazy-update approach to maintain accurate approximations of the 2-hop neighborhoods of dynamic graphs \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc.
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mageirakos:2025:CVS, author = "Vasilis Mageirakos and Bowen Wu and Gustavo Alonso", title = "Cracking Vector Search Indexes", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3951--3964", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749666", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Retrieval Augmented Generation (RAG) uses vector databases to expand the expertise of an LLM model without having to retrain it. The idea can be applied over data lakes, leading \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Heidari:2025:DDO, author = "Alireza Heidari and Amirhossein Ahmadi and Wei Zhang", title = "{DobLIX}: a Dual-Objective Learned Index for Log-Structured Merge Trees", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3965--3978", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749667", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we introduce DobLIX, a dual-objective learned index (LI) specifically designed for Log-Structured Merge (LSM) tree-based key-value stores. Traditional LIs primarily \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:CMC, author = "Xuhang Zhu and Xiu Tang and Sai Wu and Jichen Li and Haobo Wang and Chang Yao and Quanqing Xu and Gang Chen", title = "{CoLA}: Model Collaboration for Log-Based Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3979--3987", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749668", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Log-based anomaly detection plays a crucial role in ensuring the reliability of systems. While deep learning-based small detection models (SDMs) are efficient, the large language models \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jungmair:2025:TDF, author = "Michael Jungmair and Jana Giceva", title = "Towards Designing Future-Proof Data Processing Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3988--3995", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749669", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data processing systems find themselves crushed between two moving tectonic plates: the usage plate driven by the system's users and their requirements; and the environment \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Abramovich:2025:AFA, author = "Omer Abramovich and Daniel Deutch and Nave Frost and Ahmet Kara and Dan Olteanu", title = "Advancing Fact Attribution for Query Answering: Aggregate Queries and Novel Algorithms", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "3996--4008", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749670", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we introduce a novel approach to computing the contribution of input tuples to the result of the query, quantified by the Banzhaf and Shapley values. In contrast to prior \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Pachera:2025:WIC, author = "Amedeo Pachera and Mattia Palmiotto and Angela Bonifati and Andrea Mauri", title = "What If: Causal Analysis with Graph Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4009--4016", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749671", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Graphs are powerful abstractions for modeling relationships and enabling data science tasks. In causal inference, Directed Acyclic Graphs (DAGs) serve as a key formalism, but they are \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gienieczko:2025:AFS, author = "Mateusz Gienieczko and Maximilian Kuschewski and Thomas Neumann and Viktor Leis and Jana Giceva", title = "{AnyBlox}: a Framework for Self-Decoding Datasets", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4017--4031", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749672", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Research advancements in storage formats continuously produce more efficient encodings and better compression rates. Despite this, new formats are not adopted due to high \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ji:2025:FBC, author = "Yushuai Ji and Shengkun Zhu and Shixun Huang and Zepeng Liu and Sheng Wang and Zhiyong Peng", title = "Federated and Balanced Clustering for High-Dimensional Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4032--4044", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749673", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Balanced $k$-means ensures representative centroids by forming equal-sized clusters, but struggles with slow clustering of massive distributed attributes and data-sharing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hafidi:2025:RDM, author = "Mohamed Sabri Hafidi and Ozan Kahramano{\u{g}}ullar{\i} and Anton Dign{\"o}s and Johann Gamper", title = "Relational Data Models for Genetic {VCF} Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4045--4053", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749674", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The Variant Call Format (VCF) and its binary counterpart (BCF) are commonly used in bioinformatics for storing gene sequence data. While VCF files provide compact \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Giannoulidis:2025:BRC, author = "Apostolos Giannoulidis and Anastasios Gounaris and John Paparrizos", title = "{BURST}: Rendering Clustering Techniques Suitable for Evolving Streams", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4054--4063", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749675", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Identifying patterns or clusters in streaming time-series data is crucial for decision-making, and underpins applications such as anomaly detection, forecasting, and data quality \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bachras:2025:EFQ, author = "Michail Bachras and Hans-Arno Jacobsen", title = "Environmental Footprints of Query Processing: a Vision for Sustainable Database Architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "11", pages = "4064--4072", month = jul, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3749646.3749676", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Thu Oct 2 15:22:12 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Database systems underpin modern computing infrastructure, yet their environmental impact remains a significant blind spot in both industry and research. As data volumes grow \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:LMF, author = "Fengxin Li and Yi Li and Yue Liu and Chao Zhou and Yuan Wang and Xiaoxiang Deng and Wei Xue and Dapeng Liu and Lei Xiao and Haijie Gu and Jie Jiang and Hongyan Liu and Biao Qin and Jun He", title = "{LEADRE}: Multi-Faceted Knowledge Enhanced {LLM} Empowered Display Advertisement Recommender System", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4763--4776", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750602", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Display advertising plays a crucial role in benefiting advertisers, publishers, and users. Traditional display advertising systems employ a multi-stage architecture comprising retrieval, coarse ranking, ranking, and re-ranking. However, conventional \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Theodorakis:2025:TEG, author = "Georgios Theodorakis and Hugo Firth and James Clarkson and Natacha Crooks and Jim Webber", title = "{TuskFlow}: an Efficient Graph Database for Long-Running Transactions", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4777--4790", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750603", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Mammoth transactions, which involve long-running operations that access many items, are common in graph workloads. Graph analytics tasks, including pattern matching and graph algorithms, can generate large read-write operations that impact significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Antonopoulos:2025:MMM, author = "Panagiotis Antonopoulos and Mansi Chauhan and Shailender Dabas and Rajat Jain and Darshan Kattera and Wonseok Kim and Hanuma Kodavalla and Nikolas Ogg and Prashanth Purnananda and Rahul Ranjan and Alex Swanson and Divyesh Tikmani", title = "{MD-MVCC}: Multi-Version Concurrency Control for Schema Changes in {Azure SQL} Database", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4791--4803", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750605", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As applications and data evolve over time, the database schema must be adjusted to accommodate their needs. 
Schema changes in relational databases have traditionally required synchronization with concurrent read and write access, causing significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Carey:2025:TPP, author = "Michael Carey and Wail Alkowaileet and Nick DiGeronimo and Peeyush Gupta and Sachin Smotra and Till Westmann", title = "Towards Principled, Practical Document Database Design", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4804--4816", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750606", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational database design is a well-understood process enabled by a combination of database theory (e.g., normal forms) as well as conceptual modeling (e.g., ER-based design). In contrast, database design for NoSQL databases, notably document databases, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Karpathiotakis:2025:SHM, author = "Manos Karpathiotakis and Vlassios Rizopoulos and Basri Kahveci and Tiziano Carotti and Artem Gelum and Hazem Nada and Yuri Dolgov", title = "{Scribe}: How {Meta} Transports Terabytes per Second in Real Time", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4817--4830", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750607", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Millions of web servers and a multitude of applications are producing ever-increasing amounts of data in real time at Meta. 
Regardless of how data is generated and how it is processed, there is a need for infrastructure that can accommodate the transport \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ritter:2025:HNQ, author = "Daniel Ritter and Mihnea Andrei and Sukhyeun Cho and Maik G{\"o}rgens and Taehyung Lee and Norman May and Amit Pathak and Paul R. Willems", title = "The {HANA} Native Query Engine for Lakehouse Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4831--4845", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750608", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern enterprise applications and data warehouse systems move data into data lakes for economical and scalability reasons. Data is then stored in popular columnar file formats like Parquet which are optimized for writing using open table formats like \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mei:2025:DSM, author = "Yuan Mei and Rui Xia and Zhaoqian Lan and Kaitian Hu and Lei Huang and Paris Carbone and Yanfei Lei and Vasiliki Kalavri and Han Yin and Feng Wang", title = "Disaggregated State Management in {Apache Flink\reg{} 2.0}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4846--4859", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750609", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present Apache Flink 2.0, an evolution of the popular stream processing system's architecture that decouples computation from state management. Flink 2.0 relies on a remote distributed file system (DFS) for primary state storage and uses local disks \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2025:SCL, author = "Jie Jiang and Haining Xie and Siqi Shen and Yu Shen and Zihan Zhang and Meng Lei and Yifeng Zheng and Yang Li and Chunyou Li and Danqing Huang and Yinjun Wu and Wentao Zhang and Bin Cui and Peng Chen", title = "{SiriusBI}: a Comprehensive {LLM}-Powered Solution for Data Analytics in Business Intelligence", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4860--4873", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750610", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "With the proliferation of Large Language Models (LLMs) in Business Intelligence (BI), existing solutions face critical challenges in industrial deployments: functionality deficiencies from legacy systems failing to meet evolving LLM-era user demands, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chang:2025:SCM, author = "Edward Y. 
Chang and Longling Geng", title = "{SagaLLM}: Context Management, Validation, and Transaction Guarantees for Multi-Agent {LLM} Planning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4874--4886", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750611", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This paper introduces SagaLLM, a structured multi-agent architecture designed to address four foundational limitations of current LLM-based planning systems: unreliable self-validation, context loss, lack of transactional safeguards, and insufficient \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:UPC, author = "Hongtao Yang and Zhichen Xu and Sergey Yudin and Andrew Davidson", title = "Unlocking the Power of {CI\slash CD} for Data Pipelines in Distributed Data Warehouses", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4887--4895", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750613", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Ensuring the reliability of data pipelines is critical for modern data-driven organizations, yet building robust Continuous Integration (CI) in large, distributed data warehouses remains a significant challenge. Complexities arising from distributed \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:VHH, author = "Jianjun Chen and Li Zhang and Yu Xie and Wei Ding and Lixun Cao and Ye Liu and Yonghua Ding and Fangshi Li and Ke Wu and Haibo Xiu and Kui Wei and Le Cai and Rui Chang and Yuxiang Chen and Yuanjin Lin and Shangyu Luo and Jianfeng Qian and Xu Wang and Zikang Wang and Jian Zhang and Mingyi Zhang and Shicai Zeng and Jason Sun and Lei Zhang and Rui Shi and Pengwei Zhao", title = "{veDB-HTAP}: a Highly Integrated, Efficient and Adaptive {HTAP} System", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4896--4909", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750614", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In this paper, we describe veDB-HTAP, a highly integrated, efficient, and adaptive HTAP system recently built in ByteDance. veDB-HTAP adopts a highly integrated system architecture by leveraging the Secondary Engine mechanism provided by MySQL and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kanellis:2025:FFE, author = "Konstantinos Kanellis and Badrish Chandramouli and Ted Hart and Shivaram Venkataraman", title = "From {FASTER} to {F2}: Evolving Concurrent Key--Value Store Designs for Large Skewed Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4910--4923", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750615", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern large-scale services such as search engines, messaging platforms, and serverless functions, rely on key-value (KV) stores to maintain high performance at scale. When such services are deployed in constrained memory environments, they present \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chakkappen:2025:AIO, author = "Sunil Chakkappen and Shreya Kunjibettu and Daniel McGreer and Masoomeh Javidi Kishi and Hong Su and Mohamed Ziauddin and Mohamed Zait and Zhan Li and Yuying Zhang", title = "Automatic Indexing in {Oracle}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4924--4937", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750616", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Indexes are one of the important access structures that help improve database performance. This paper provides a methodology to automate the entire lifecycle of index creation and management with continuous index tuning based on changing data and \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hubail:2025:CCJ, author = "Murtadha {Al Hubail} and Ali Alsuliman and Wail Alkowaileet and Michael Blow and Michael Carey and Savyasach Enukonda and Peeyush Gupta and Santosh Hegde and Kamini Jagtiani and Abhishek Jindal and Mohammad Nawazish Khan and Mehnaz Mahin and Ian Maxon and M. Muralikrishna and Keshav Murthy and Daniel Nagy and Preetham Poluparthi and Ankit Prabhu and Ritik Raj and Vijay Sarathy and Shahrzad Shirazi and Utsav Singh and Hussain Towaileb and Ayush Tripathi and Janhavi Tripurwar and Bo-Chun Wang and Till Westmann", title = "Cloudy with a Chance of {JSON}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4938--4950", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750618", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Couchbase Capella is a scalable document-oriented database service in the cloud. Its existing Capella Operational service is based on a shared-nothing architecture and supports high volumes of low-latency queries and updates for JSON documents. Its new \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2025:GVL, author = "Ji Sun and Guoliang Li and James Pan and Jiang Wang and Yongqing Xie and Ruicheng Liu and Wen Nie", title = "{GaussDB-Vector}: a Large-Scale Persistent Real-Time Vector Database for {LLM} Applications", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4951--4963", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750619", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector databases are widely used as a fundamental tool for addressing the weaknesses of large language model (LLM) applications, specifically hallucinations and the high cost of inference. However, existing vector databases either cater to niche \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Song:2025:MHA, author = "Jun Song and Jingyi Ding and Irshad Kandy and Yanghao Lin and Zhongjia Wei and Zilong Zhou and Zhiwei Peng and Jixi Shan and Hongyue Mao and Xiuqi Huang and Xun Song and Cheng Chen and Yanjia Li and Tianhao Yang and Wei Jia and Xiaohong Dong and Kang Lei and Rui Shi and Pengwei Zhao and Wei Chen", title = "{Magnus}: a Holistic Approach to Data Management for Large-Scale Machine Learning Workloads", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4964--4977", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750620", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Machine learning (ML) has become a cornerstone of key applications at ByteDance. 
As model complexity and data volumes surge, data management for large-scale ML workloads faces substantial challenges, particularly with recent advances in large \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gao:2025:DED, author = "Xin Gao and Sibasish Acharya and Sihui Han and Yongxiong Ren and Yanli Zhao and Liang Luo and Chucheng Wang and Pradeep Fernando and Saurabh Mishra and Siqi Yan and Yicong Du and Elzbieta Krepska and Intaik Park and Min Ni and Qunshu Zhang and Shen Li", title = "{DECK}: Experiences on Delta Checkpointing for Industrial Recommendation Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4978--4990", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750621", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In large-scale industrial recommendation systems, model checkpoints are instrumental in maintaining training goodput and numerical correctness during system failures and job preemptions. The increasing prevalence of multi-terabyte models has rendered \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jiang:2025:GPQ, author = "Zhe Jiang and Zhaoguo Wang and Haoning Lan and Chuzhe Tang and Haoran Ding and Lefeng Wang and Songyun Zou and Zhuoran Wei and Yongcun Liu and Xiang Yu and Yang Ren and Guoliang Li and Haibo Chen", title = "{GRewriter}: Practical Query Rewriting with Automatic Rule Set Expansion in {GaussDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "4991--5003", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750622", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Effectively rewriting a wide range of complex and diverse queries is critical for database systems. Huawei GaussDB has been experiencing limited extensibility of its existing query rewriter. The problem is rooted in the need for one-size-fits-all \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:FES, author = "Jun-Peng Zhu and Lingfeng Zhang and Peng Cai and Xuan Zhou and Peisen Zhao and Xue Wang and Linpeng Tang", title = "{FDBKeeper}: Enabling Scalable Coordination Services for Metadata Management Using Distributed Key--Value Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5004--5016", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750623", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "High-reliability distributed coordination services have become an indispensable part of modern large-scale distributed systems. 
Popular coordination services (e.g., ZooKeeper) adopt a single-writer design to provide a centralized service for managing \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhong:2025:VOS, author = "Xiaoyao Zhong and Haotian Li and Jiabao Jin and Mingyu Yang and Deming Chu and Xiangyu Wang and Zhitao Shen and Wei Jia and George Gu and Yi Xie and Xuemin Lin and Heng Tao Shen and Jingkuan Song and Peng Cheng", title = "{VSAG}: an Optimized Search Framework for Graph-Based Approximate Nearest Neighbor Search", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5017--5030", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750624", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approximate nearest neighbor search (ANNS) is a fundamental problem in vector databases and AI infrastructures. Recent graph-based ANNS algorithms have achieved high search accuracy with practical efficiency. Despite the advancements, these algorithms \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Sun:2025:RBL, author = "Zhaoyan Sun and Xuanhe Zhou and Guoliang Li and Xiang Yu and Jianhua Feng and Yong Zhang", title = "{R-Bot}: an {LLM}-Based Query Rewrite System", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5031--5044", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750625", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query rewrite is essential for optimizing SQL queries to improve their execution efficiency without changing their results. Traditionally, this task has been tackled through heuristic and learning-based methods, each with its limitations in terms of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Schultz:2025:DMV, author = "William Schultz and Murat Demirbas", title = "Design and Modular Verification of Distributed Transactions in {MongoDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5045--5058", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750626", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "MongoDB's distributed multi-document transactions protocol was designed and developed incrementally, building on WiredTiger, an existing single node multi-version storage engine that provided snapshot isolated key-value storage. This layered approach \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:SSP, author = "Xinjun Yang and Feifei Li and Yingqiang Zhang and Hao Chen and Qingda Hu and Panfeng Zhou and Qiang Zhang and Shuai Li and Zongzhi Chen and Zheyu Miao and Rongbiao Xie and Chuan Sun and Zetao Wei and Jing Fang and Xingxuan Zhou and Xiaofei Wu", title = "From Scale-Up to Scale-Out: {PolarDB's} Journey to Achieving 2 Billion {tpmC}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5059--5072", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750627", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the past decade, cloud databases have experienced rapid development and growth. PolarDB, Alibaba's cloud-native OLTP database, has evolved significantly to meet the increasing demand for cloud-native architectures and now serves hundreds of thousands \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:SSP, author = "Mingyu Liu and Junbin Kang and Kai Wang and Lu Zhang and Haibo Chen and Xiuchang Li and Tianhong Ding", title = "{ScaleCache}: Scalable and Production-Grade Buffer Management for Disk-Based Database Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5073--5085", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750628", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Buffer management is critical for DBMSs but often suffers from scalability bottlenecks and poor cache locality, which stems from centralized reference counting in page access and intensive locking in page-to-buffer translation. However, prior radical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhu:2025:TAC, author = "Jun-Peng Zhu and Boyan Niu and Peng Cai and Zheming Ni and Jianwei Wan and Kai Xu and Jiajun Huang and Shengbo Ma and Bing Wang and Xuan Zhou and Guanglei Bao and Donghui Zhang and Liu Tang and Qi Liu", title = "Towards Automated Cross-Domain Exploratory Data Analysis through Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5086--5099", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750629", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Exploratory data analysis (EDA), coupled with SQL, is essential for data analysts involved in data exploration and analysis. 
However, data analysts often encounter two primary challenges: (1) the need to craft SQL queries skillfully and (2) the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Tong:2025:GAT, author = "Bing Tong and Yan Zhou and Chen Zhang and Jianheng Tang and Jia Li and Lei Chen", title = "{GalaxyWeaver}: Autonomous Table-to-Graph Conversion and Schema Optimization with Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5100--5112", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750630", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Most enterprise graph data derives from relational databases, yet transforming relational tables into query-optimized graph schemas remains challenging. Existing approaches have notable limitations: (1) transformations based on primary and foreign keys \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gubner:2025:FMB, author = "Tim Gubner and Rune Humborstad and Manyi Lu", title = "Freely Moving between the {OLTP} and {OLAP} Worlds: {Hermes} --- a High-Performance {OLAP} Accelerator for {MySQL}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5113--5125", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750631", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Users often want to run analytics on their OLTP databases, to avoid costly and cumbersome Extract-Transform-Load (ETL) processes. 
Typically, analytical queries run rather slow on OLTP DBMS, making Hybrid Transaction/Analytic Processing (HTAP) solutions \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Szlang:2025:WIS, author = "Jan Vincent Szlang and Sebastian Bress and Sebastian Cattes and Jonathan Dees and Florian Funke and Max Heimel and Michel Oleynik and Ismail Oukid and Tobias Maltenberger", title = "Workload Insights from the Snowflake Data Cloud: What Do Production Analytic Queries Really Look Like?", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5126--5138", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750632", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Capturing the characteristics of real-world analytical workloads is challenging yet critical for advancing industry practices and academic research. Historically, obtaining accurate query and data characteristics has been difficult, largely because \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:APC, author = "Fangyuan Zhang and Caihua Yin and Hua Fan and Fenghua Fang and Yineng Chen and Xuqi Wang and Mengqi Wu and Bing Chen and Tianbo Jin and Sibo Wang and Wenchao Zhou and Feifei Li", title = "{AnalyticDB-PG}: a Cloud-Native High-Performance Data Warehouse in {Alibaba Cloud}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5139--5152", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750633", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In the era of big data, the landscape of data management and analytics has significantly transformed, presenting diverse challenges for cloud platforms. Modern data warehouses face increasing challenges in handling hybrid transactional and analytical \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:SVE, author = "Fangyuan Zhang and Mengqi Wu and Chunlei Xu and Yunong Bao and Jiyu Qiao and Yingli Zhou and Hua Fan and Caihua Yin and Wenchao Zhou and Feifei Li", title = "Streaming View: an Efficient Data Processing Engine for Modern Real-Time Data Warehouse of {Alibaba Cloud}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5153--5165", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750634", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-time data warehouses are essential for modern applications. 
Extract-Transform-Load (ETL) as a fundamental component of offline data warehouses also provides crucial support within realtime data warehouses. Among various traditional ETL approaches, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Upreti:2025:CEL, author = "Nitish Upreti and Harsha Vardhan Simhadri and Hari Sudan Sundar and Krishnan Sundaram and Samer Boshra and Balachandar Perumalswamy and Shivam Atri and Martin Chisholm and Revti Raman Singh and Greg Yang and Tamara Hass and Nitesh Dudhey and Subramanyam Pattipaka and Mark Hildebrand and Magdalen Manohar and Jack Moffitt and Haiyang Xu and Naren Datha and Suryansh Gupta and Ravishankar Krishnaswamy and Prashant Gupta and Abhishek Sahu and Hemeswari Varada and Sudhanshu Barthwal and Ritika Mor and James Codella and Shaun Cooper and Kevin Pilch and Simon Moreno and Aayush Kataria and Santosh Kulkarni and Neil Deshpande and Amar Sagare and Dinesh Billa and Zishan Fu and Vipul Vishal", title = "Cost-Effective, Low Latency Vector Search with {Azure Cosmos DB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5166--5183", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750635", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Vector indexing enables semantic search over diverse corpora and has become an important interface to databases for both users and AI agents. Efficient vector search requires deep optimizations in database systems. This has motivated a new class of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Merli:2025:ULN, author = "Matteo Merli and Sijie Guo and Penghui Li and Hang Chen and Neng Lu", title = "{Ursa}: a Lakehouse-Native Data Streaming Engine for {Kafka}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5184--5196", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750636", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data lakehouse architectures unify the cost-efficiency of data lakes with the transactional guarantees of data warehouses. Yet, real-time ingestion often depends on external streaming systems such as Apache Kafka, along with bespoke connectors that read \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Puttaswamy:2025:DSO, author = "Krishna Puttaswamy and Abhijit Chakankar and Tao Tao and Zaheera Valani and Ramesh Chandra and William Chau and Mengxi Chen and Akram Chetibi and Tianyi Huang and Jonathan Keller and Celia Kung and Andy Liu and Charlene Lyu and Samarth Shetty and Xiaotong Sun and Steve Weis and Lin Zhou and Ryan Zhu and Reynold Xin and Matei Zaharia", title = "Delta Sharing: an Open Protocol for Cross-Platform Data Sharing", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5197--5209", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750637", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Organizations across industries increasingly rely on sharing data to drive collaboration, innovation, and business performance. 
However, securely and efficiently sharing live data across diverse platforms and adhering to varying governance requirements \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lightstone:2025:STA, author = "Sam Lightstone and Ping Wang", title = "{SQL:Trek} Automated Index Design at {Airbnb}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5210--5222", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750638", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Automating index design has been an active area of research for decades due to the significant impact that indexes have on query performance and database efficiency. Existing approaches range from brute-force search to cost-based optimizations and, more \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Baeuerle:2025:TAC, author = "Marc Baeuerle and Thomas Bodner and Martin Boissier and Tilmann Rabl and Ricardo Salazar D{\'\i}az and Florian Schmeller and Nils Strassenburg and Ilin Tolovski and Marcel Weisgut and Wang Yue", title = "{TCO$_2$}: Analyzing the Carbon Footprint of Database Server Replacements", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5223--5226", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750604", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data centers produce a significant and increasing amount of CO$_2$ emissions. 
In the past, these have been predominantly due to energy generation for powering data centers. With the transition to energy sources with lower carbon production, the embodied \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koupil:2025:FHN, author = "Pavel Koupil and J{\'a}chym B{\'a}rt{\'\i}k and Stefan Klessinger and Andr{\'e} Conrad and Stefanie Scherzinger", title = "{FDepHunter}: Harnessing Negative Examples to Expose Fakes and Reveal Ghosts", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5227--5230", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750612", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Functional dependency (FD) discovery is fundamental in data profiling. Inevitably, existing approaches can return fake FDs that hold only coincidentally. Moreover, these approaches fall short of identifying ghost FDs that would be observable in a clean \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Kang:2025:VDE, author = "Rong Kang and Shuai Wang and Tieying Zhang and Xianghong Xu and Linhui Xu and Zhimin Liang and Lei Zhang and Rui Shi and Jianjun Chen", title = "{VIDEX}: a Disaggregated and Extensible Virtual Index for the Cloud and {AI} Era", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5231--5234", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750639", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Virtual indexes play a crucial role in database query optimization. However, with the rapid advancement of cloud computing and AI-driven models for database optimization, traditional virtual index approaches face significant challenges. Cloud-native \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yona:2025:DCC, author = "Roi Yona and Jonathan Breitman and Benny Kimelfeld", title = "{DVote}: Constraining Committee Voting with Database Dependencies", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5235--5238", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750640", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Approval-Based Committee (ABC) voting refers to the task of selecting a committee of a desired size, given voter preferences that state the specific candidates that each voter approves of. A voting rule aggregates the voter preferences into a winning \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:GCI, author = "Yangxin Fan and Haolai Che and Mingjian Lu and Yinghui Wu", title = "Graph Compression for Interpretable Graph Neural Network Inference At Scale", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5239--5242", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750641", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate ExGIS, a parallel inference query engine to support explainable Graph Neural Network (GNNs) inference analysis in large graphs. (1) For a class of GNNs M$^L$ with at most L layers, and a graph G, ExGIS performs an offline, once-for-all \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:QRT, author = "Bingnan Chen and Binyang Dai and Qichen Wang and Ke Yi", title = "Query Running Too Slow? {Rewrite} it with {Quorion}!", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5243--5246", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750642", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We will demonstrate Quorion, a query rewriter with theoretical guarantees and better practical performance. Quorion adopts some of the recently developed query planning methods that provide optimality guarantees, including Yannakakis$^+$, an optimized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jensen:2025:DMM, author = "S{\o}ren Kejser Jensen and Christian Schmidt Godiksen and Christian Thomsen and Torben Bach Pedersen", title = "Demonstration of {ModelarDB}: Model-Based Management of High-Frequency Time Series Across Edge, Cloud, and Client", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5247--5250", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750643", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Renewable Energy Sources (RESs) are monitored by many high-quality sensors that produce vast amounts of high-frequency time series data. This can be used to increase the renewable energy production and longevity of the RESs, e.g., yaw misalignment \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Lambrecht:2025:DM, author = "Louisa Lambrecht and Tim Findling and Samuel Heid and Marcel Kn{\"u}deler and Torsten Grust", title = "Democratize {MATCH\_RECOGNIZE}!", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5251--5254", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750644", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Row pattern matching in terms of the MATCH\_RECOGNIZE clause is a powerful and relatively recent feature in SQL that allows users to define regular patterns over ordered rows in a table. As of today, few database systems offer support for match recognize, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Heinrich:2025:OBB, author = "Roman Heinrich and Oleksandr Havrylov and Manisha Luthra and Johannes Wehrstein and Carsten Binnig", title = "Opening the Black-Box: Explaining Learned Cost Models for Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5255--5258", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750645", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Learned Cost Models (LCMs) have shown superior results over traditional database cost models as they can significantly improve the accuracy of cost predictions. However, LCMs still fail for some query plans, as prediction errors can be large in the tail. \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Emani:2025:HRC, author = "Venkatesh Emani and Wenjing Wang and Zi Ye and Jia He and Neel Ball and Kumaraswamy Boora and Carlo Curino and Avrilia Floratou and Manan Goenka and Paridhi Gupta and Vivek Gupta and Katherine Lin and Nick Litombe and Jared Meade and Suryakant Mutnal and Mark Pryce-Maher and Raghu Ramakrishnan and Sudhir Raparla and Dhruv Relwani and Shyam Sai and Vaibhave Sekar and Roneet Shaw and Harmeet Singh and Prasanna Sridharan and Mark Taylor and Sunidhi Tiwari and Yiwen Zhu", title = "{Horizon}: Robust Checks for {SQL} Migration Using {LLMs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5259--5262", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750646", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", 
abstract = "Large language models (LLMs) have recently demonstrated strong capabilities in code migration across languages, making them promising for SQL schema migration. However, achieving reliable and accurate SQL migration with LLMs remains a challenge. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:DQR, author = "Wenhao Liu and Xiu Tang and Sai Wu and Chang Yao and Gongsheng Yuan and Gang Chen", title = "A Demonstration of {QueryArtisan}: Real-Time Data Lake Analysis via Dynamically Generated Data Manipulation Code", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5263--5266", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750647", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Querying and analyzing data in data lakes requires substantial manual intervention, including numerous data preprocessing steps, and often demands complex domain expertise. However, the advent of Large Language Models (LLMs) has introduced a promising \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cohen:2025:RRS, author = "Dvir Cohen and Liad Domb and Avigdor Gal and Lior Ganon and Eliezer Gavriel and Omri Lazover and Coral Scharf and Bar Shterenberg", title = "{RecForUS}: a Recommender System for Uncertain Scores", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5267--5270", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750648", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present RecForUS, a recommender system designed to offer accurate music recommendations through a competition between participants and an algorithmic recommender. Our framework aims to demonstrate the intricate management of uncertain scores in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Trudslev:2025:PTI, author = "Frederik Marinus Trudslev and Matteo Lissandrini and Juan Manuel Rodriguez and Martin B{\o}gsted and Daniele Dell'Aglio", title = "{PrivEval}: a Tool for Interactive Evaluation of Privacy Metrics in Synthetic Data Generation", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5271--5274", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750649", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Synthetic data generation (SDG) is the process of generating a new synthetic dataset based on the statistical properties of a confidential existing dataset. 
Differential privacy is the property of a SDG mechanism that establishes how protected \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Psarakis:2025:SAT, author = "Kyriakos Psarakis and Oto Mraz and George Christodoulou and George Siachamis and Marios Fragkoulis and Asterios Katsifodimos", title = "Styx in Action: Transactional Cloud Applications Made Easy", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5275--5278", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750650", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Developing and deploying transactional cloud applications such as banking and e-commerce systems is a daunting task for developers. The reason for this difficulty is twofold. First, developing such applications shifts the developers' focus from the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Marcy:2025:CSK, author = "Mathilde Marcy and Jean-Marc Petit and Vasile-Marian Scuturici and Jocelyn Bonjour and Camille Fertel and Gerald Cavalier", title = "Can Surrogate Keys Negatively Impact Data Quality?", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5279--5282", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750651", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Surrogate keys are now extensively utilized by database designers to implement keys in SQL tables. 
They are straightforward, easy to understand, enable efficient access, and are often considered a sufficient guarantee of data integrity despite lacking \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Hattasch:2025:JJI, author = "Benjamin H{\"a}ttasch and Leon Kr{\"u}ger and Carsten Binnig", title = "{JUSTINE (JUST-INsert Engine)}: Demonstrating Self-Organizing Data Schemas", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5283--5286", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750652", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Relational databases are great for data analysis and exploration, but require a carefully crafted schema, which causes high manual overhead. Moreover, entities not considered during schema design cannot be stored. In contrast, schemaless approaches allow \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wang:2025:USU, author = "Jiayi Wang and Yuan Li and Jianming Wu and Shihui Xu and Guoliang Li", title = "{Unify}: a System For Unstructured Data Analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5287--5290", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750653", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Unstructured data comprises over 80\% of today's information, yet no specialized system effectively supports its semantic analytics. Traditional SQL-based approaches rely on predefined schemas, making them unsuitable. 
While large language models (LLMs) \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Beischl:2025:UPR, author = "Alexander Beischl and Thomas Neumann", title = "{UmbraPerf} --- Profiling Results Tailored for {DBMS} Developers", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5291--5294", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750654", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Developing a code-generating Database Management System requires tight profiling and performance-tuning iterations. However, existing profilers report results at instruction or function level, making it challenging to correlate them with constructs like \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Mohanaraj:2025:SSA, author = "Abiram Mohanaraj and Matteo Lissandrini and Katja Hose", title = "{Smart SPARQL Advisor}: Guiding Users in Query Formulation with Performance Prediction", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5295--5298", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750655", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Writing SPARQL queries is often an iterative process, where users refine queries until they meet their information needs. However, long-running query executions can lead to inefficient workflows, as users must wait idly for results --- potentially without \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhou:2025:AHD, author = "Jiatang Zhou and Kaisong Huang and Zhuoyue Zhao and Dong Xie and Tianzheng Wang", title = "Analytics Are Heavy. {The} {DBMS} Is Busy. {When} Will My Mission-Critical Transaction Start Running?", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5299--5302", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750656", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Conventional non-preemptive scheduling strategies struggle to meet the latency requirements of mixed workloads: low-priority, long-running analytics can dominate CPU cores while short, high-priority transactions wait a long time to be scheduled. Although \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Veltri:2025:ATI, author = "Enzo Veltri and Donatello Santoro and Jean-Flavien Bussotti and Paolo Papotti", title = "Accelerating Tabular Inference: Training Data Generation with {TENET}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5303--5306", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750657", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Tabular Natural Language Inference (TNLI) involves machine learning models that assess whether structured tabular data supports or contradicts a hypothesis formulated in natural language. TNLI models typically require large sets of training examples, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zhang:2025:ABP, author = "Xukang Zhang and Huanchen Zhang and Xiaofeng Meng", title = "{Accordion}: Balancing Performance and Cost in Cloud-Native Data Analysis with Intra-Query Runtime Elasticity", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5307--5310", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750658", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Cloud databases empower users to leverage vast computing resources for efficient data analysis. However, achieving cost-effective utilization of these resources remains a challenge. Users often struggle to balance computing resource allocation with their \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gu:2025:HSQ, author = "Long Gu and Shaza Zeitouni and Carsten Binnig and Zsolt Istv{\'a}n", title = "How {SMPC} Query Execution Can be Sped Up through Efficient and Flexible Intermediate Result Size Trimming", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5311--5314", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750659", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "There is growing interest in Secure Collaborative Analytics, but fully oblivious query execution in Secure Multi-Party Computation (MPC) settings is prohibitively expensive. Recent related works proposed different approaches to trimming the size of \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Gavriilidis:2025:EWF, author = "Haralampos Gavriilidis and Joel Ziegler and Midhun Kaippillil Venugopalan and Benedikt Didrich and Matthias Boehm and Volker Markl", title = "Enter the Warp: Fast and Adaptive Data Transfer with {XDBC}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5315--5318", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750660", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Fast and scalable data transfer is crucial in today's decentralized data ecosystems and data-driven applications, including extraction-transformation-loading (ETL) pipelines, and data science workflows. Transfers often occur across heterogeneous \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zecchini:2025:RDS, author = "Luca Zecchini and Ziawasch Abedjan and Vasilis Efthymiou and Giovanni Simonini", title = "{RadlER}: Deduplicated Sampling On-Demand", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5319--5322", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750661", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data practitioners often need to sample their datasets to produce representative subsets for their downstream tasks. Unfortunately, real-world datasets frequently contain duplicates, whose presence biases sampling and impacts the quality of the produced \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Shinde:2025:MGD, author = "Amey Shinde and Viraj Sabhaya and Kevin Farokhrouz and Fariba Irany and Ali Khan and Sanjukta Bhowmick and Abhishek Santra and Sharma Chakravarthy", title = "{MLN-geeWhiz}: a Dashboard for Supporting Complete Life-Cycle of Complex Data Analysis Using Multilayer Networks", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5323--5326", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750662", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Over the last few decades, simple graphs have been extensively used for studying complex systems of interacting entities from diverse disciplines, such as social networks, transportation, epidemiology, etc. However, when studying data with multiple types \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Xiu:2025:HQH, author = "Haibo Xiu and Yang Li and Qianyu Yang and Weihang Guo and Yuxi Liu and Pankaj K. Agarwal and Sudeepa Roy and Jun Yang", title = "{Hint-QPT}: Hints for Robust Query Performance Tuning", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5327--5330", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750663", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Query optimizers rely heavily on selectivity estimates to choose efficient execution plans, but inaccuracies in these estimates often result in poor query performance. 
We introduce Hint-QPT (Hints for Robust Query Performance Tuning), an interactive tool \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Agmon:2025:CFC, author = "Shunit Agmon and David Avigdor and Brit Youngmann and Amir Gilad and Benny Kimelfeld", title = "{ClaimIt}: Finding Convincing Views to Endorse a Claim", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5331--5334", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750664", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The demonstration presents ClaimIt --- a tool for extracting views that support a user-provided claim. Such views can assist users in finding evidence of phenomena of interest, criticizing given claims by proposing opposing viewpoints, inspecting the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Jezek:2025:DBQ, author = "Filip Je{\v{z}}ek and Pavel Koupil and Michal Kopeck{\'y} and J{\'a}chym B{\'a}rt{\'\i}k and Irena Holubov{\'a}", title = "{DortDB}: Bridging Query Languages for Multi-Model Data Ponds", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5335--5338", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750665", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-model data encompasses structurally distinct data, including relational, document, graph, key/value, columnar, etc., managed within a single system, such as a multi-model database or a data lake. 
Querying multi-model data requires strategies that \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Qian:2025:DMO, author = "Zekai Qian and Xiaoou Ding and Chen Wang and Hongzhi Wang", title = "{DemandClean}: a Multi-Objective Learning Framework for Balancing Model Tolerance to Data Authenticity and Diversity", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5339--5342", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750666", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-world datasets often suffer from multiple quality issues, hindering downstream model performance and increasing cleaning costs. To address this, we propose DemandClean, a reinforcement learning-based adaptive data cleaning framework that dynamically \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ding:2025:TTA, author = "Xiaoou Ding and Yanshuo Liu and Zhounan Chen and Hongzhi Wang and Chen Wang and Jianmin Wang", title = "{TARImpute}: Task-Aware Auto-Recommender System for Missing Value Imputation Algorithms with Clustering Case Studies", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5343--5346", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750667", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Missing data prevalent in information systems impacts data diversity and fidelity, which systematically degrade clustering performance through biased similarity measures and unstable cluster boundaries. Current large-scale environments lack standardized \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Huang:2025:DPI, author = "Yuchuan Huang and Ana Elena Uribe and Youssef Hussein and Grant Ogren and Kareem Eldahshoury and Mohamed F. Mokbel", title = "A Demonstration of {Polaris}: an Interactive and Scalable Data Infrastructure for Polar Science", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5347--5350", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750668", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demonstration presents Polaris; a novel open-source system infrastructure for Polar science that is highly Interactive and Scalable. 
Polaris is designed based on three observations that distinguish the query workload of polar scientists, namely, all \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Chen:2025:GDE, author = "Zixuan Chen and Jinyang Li and H. V. Jagadish and Mirek Riedewald", title = "{GooseDB}: a Database Engine that Optimally Refines Top-{$k$} Queries to Satisfy Representation Constraints", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5351--5354", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750669", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In many applications, from university rankings to the selection of candidates for a job interview, there exist various ``reasonable'' ways to filter the data and generate a ranking. When the initial choice lacks certain desirable properties, we want to \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ntouni:2025:NNC, author = "Ourania Ntouni and Dimitrios Banelas and Nikos Giatrakos", title = "{NeuroFlinkCEP}: Neurosymbolic Complex Event Recognition Optimized across {IoT} Platforms", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5355--5358", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750670", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate NeuroFlinkCEP, the first framework that integrates neural and symbolic Complex Event Recognition (CER) over a state-of-the-art Big Data platform, also optimizing neurosymbolic CER upon operating over IoT settings. NeuroFlinkCEP receives \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Grafberger:2025:MII, author = "Stefan Grafberger and Paul Groth and Sebastian Schelter", title = "{mlidea}: Interactively Improving {ML} Data Preparation Code via ``Shadow Pipelines''", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5359--5362", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750671", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Data scientists develop ML pipelines in an iterative manner: they repeatedly screen a pipeline for potential issues, debug it, and then revise and improve its code according to their findings. However, this manual process is tedious and error-prone. To \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Koehler:2025:MMK, author = "Henning Koehler and Sebastian Link", title = "Mining Meaningful Keys and Foreign Keys with High Precision and Recall", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5363--5366", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750672", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We demonstrate a next-generation Entity/Relationship (E/R) Profiler that mines meaningful key/foreign key relationships from a given data repository. Core novelties include a strict hierarchy of key variants ranging from candidate keys to SQL unique \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Benjira:2025:SKF, author = "Wissal Benjira and Nicolas Travers and Faten Atigui and B{\'e}n{\'e}dicte Bucher and Malika Grim-Yefsah", title = "{SDG-KG}: a Framework to Compute {SDG} Indicators with Open Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5367--5370", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750673", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Monitoring Sustainable Development Goal (SDG) indicators requires integrating heterogeneous open datasets from sources such as relational databases, NoSQL stores, and APIs. While SDG indicators follow standardized definitions, open data sources are often \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fan:2025:FPP, author = "Zeheng Fan and Yuxiang Zeng and Zhuanglin Zheng and Yongxin Tong", title = "{FedVSE}: a Privacy-Preserving and Efficient Vector Search Engine for Federated Databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5371--5374", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750674", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient vector search is a foundational capability of vector databases. However, most prior research overlooks its critical role in federated databases for applications like financial risk control and smart healthcare. In these privacy-sensitive \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Eggers:2025:ADL, author = "Sebastian Eggers and Nina Zukowska and Ziawasch Abedjan", title = "{APEX-DAG}: Library and Language independent Pipeline {EXtraction}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5375--5378", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750675", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Modern data-driven systems often rely on complex pipelines to process and transform data for downstream machine learning (ML) tasks. Extracting these pipelines and understanding their structure is critical for ensuring transparency, performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Ahmadi:2025:DMM, author = "Fatemeh Ahmadi and Julian Paulu{\ss}en and Ziawasch Abedjan", title = "Demonstrating {Matelda} for Multi-Table Error Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5379--5382", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750676", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Real-world datasets are often fragmented across multiple heterogeneous tables, managed by different teams or organizations. Ensuring data quality in such environments is challenging, as traditional error detection tools typically operate on isolated \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Wu:2025:DGB, author = "Qingliu Wu and Qingfeng Xiang and Yingxia Shao and Qiyao Luo and Quanqing Xu", title = "{DBPecker}: a Graph-Based Compound Anomaly Diagnosis System for Distributed {RDBMSs}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5383--5386", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750677", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "This demonstration introduces DBPecker, an integrated diagnostic platform tailored for distributed relational database systems. DBPecker leverages a graph-based anomaly modeling approach to capture inter-node dependencies and effectively localize \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Li:2025:DDU, author = "Zequn Li and Yuanhao Zhong and Chengliang Chai and Zhaoze Sun and Yuhao Deng and Ye Yuan and Guoren Wang and Lei Cao", title = "{DocDB}: a Database for Unstructured Document Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5387--5390", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750678", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Recent studies have developed LLM-powered data systems that enable database-like analysis of unstructured text documents. While LLMs excel at attribute extraction from documents, their high computational costs and latency make extraction operations the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yan:2025:CCA, author = "Jianxin Yan and Wangze Ni and Lei Chen and Xuemin Lin and Peng Cheng and Zhan Qin and Kui Ren", title = "{ContextCache}: Context-Aware Semantic Cache for Multi-Turn Queries in Large Language Models", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5391--5394", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750679", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Semantic caching significantly reduces computational costs and improves efficiency by storing and reusing large language model (LLM) responses. However, existing systems rely primarily on matching individual queries, lacking awareness of multi-turn \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Zacouris:2025:STS, author = "Zenon Zacouris and Maribel Acosta", title = "Simulating a Transactional Server for Multi-Model Systems", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5395--5398", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750680", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Multi-model systems integrate heterogeneous models, making consistency management a critical challenge. We present M2TS, a transactional server simulator for multi-model environments, enabling users to analyze the impact of consistency-preserving \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cui:2025:TTA, author = "Lingxi Cui and Guanyu Jiang and Huan Li and Ke Chen and Lidan Shou and Gang Chen", title = "{TableCopilot}: a Table Assistant Empowered by Natural Language Conditional Table Discovery", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5399--5402", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750681", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The rise of LLM has enabled natural language-based table assistants, but existing systems assume users already have a well-formed table, neglecting the challenge of table discovery in large-scale table pools. To address this, we introduce TableCopilot, \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Cao:2025:LSL, author = "Shuting Cao and Zeping Niu and Guoliang Li", title = "{LETIndex}: a Secure Learned Index with {TEE}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5403--5406", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750682", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Trusted execution environment (TEE) offers a promising approach to building encrypted databases, which keep data confidential for users. However, designing an efficient index for TEE databases remains a significant challenge. Due to the limited enclave \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Fama:2025:PWP, author = "Mauro Fam{\`a} and Alessandro Ferri and Samuele Langhi and Riccardo Tommasini and Angela Bonifati", title = "{Play2Win}: a Windowing Playground for Continuous Queries", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5407--5410", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750683", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Continuous Queries (CQs) are designed to operate over infinite data streams; the paradigm gained prominence with the rise of Stream Processing (SPs). Central to CQs are window operators as they enforce bounded computation by partitioning streams into \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bellomarini:2025:VLF, author = "Luigi Bellomarini and Andrea Gentili and Davide Magnanimi and Emanuel Sallinger", title = "{Vadacode}: a Logician-Friendly {IDE} for {Datalog$^\pm $}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5411--5414", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750684", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Languages, namely, fragments, of the Datalog+/- family are attracting interest in both academia and industry because of their possibility to balance high expressive power and computational complexity. However, understanding the differences among the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Dorbani:2025:BQD, author = "Anas Dorbani and Sunny Yasser and Jimmy Lin and Amine Mhedhbi", title = "Beyond Quacking: Deep Integration of Language Models and {RAG} into {DuckDB}", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5415--5418", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750685", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Knowledge-intensive analytical applications retrieve context from both structured tabular data and unstructured free text documents for effective decision-making. Large language models (LLMs) have significantly simplified the prototyping of such \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Yang:2025:SVS, author = "Fan Yang and John Paparrizos", title = "{SAIL}: a Voyage to Symbolic Approximation Solutions for Time-Series Analysis", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5419--5422", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750686", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Symbolic Approximation, a dimensionality reduction technique that transforms time series into discrete symbols, has gained increasing attention in various downstream applications. Despite decades of development, there is a noticeable absence of a \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Warner:2025:BDM, author = "Annabelle Warner and Andrew McNutt and Paul Rosen and El Kindi Rezig", title = "{Buckaroo}: a Direct Manipulation Visual Data Wrangler", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5423--5426", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750687", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Preparing datasets --- a critical phase known as data wrangling --- constitutes the dominant phase of data science development, consuming upwards of 80\% of the total project time. This phase encompasses a myriad of tasks: parsing data, restructuring it for \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Khatri:2025:SIL, author = "Akash Khatri and Mir Mahathir Mohammad and El Kindi Rezig", title = "Sort it Like You Mean It: Discovering Semantically Interesting Attribute Augmentations to Sort Tables", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5427--5430", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750688", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Sorting is a fundamental operation in table analysis. Data scientists frequently sort tables to uncover key insights --- for example, identifying the top 10 products by sales. However, this process is largely manual. Data scientists must (1) understand the \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:EDA, author = "Qinghua Liu and Seunghak Lee and John Paparrizos", title = "{EasyAD}: a Demonstration of Automated Solutions for Time-Series Anomaly Detection", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5431--5434", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750689", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Despite the recent focus on time-series anomaly detection, the effectiveness of the proposed anomaly detectors is restricted to specific domains. A model that performs well on one dataset may not perform well on another. Therefore, how to develop \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Bahadori:2025:LLA, author = "Tarlan Bahadori and Sai Sreekar Sarvepalli and Ahmed Eldawy", title = "{LASEK}: {LLM}-Assisted Style Exploration Kit for Geospatial Data", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5435--5438", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750690", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Geospatial data visualization on a map is an essential tool for modern data exploration tools. However, these tools require users to manually configure the visualization style including color scheme and attribute selection, a process that is both complex \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. VLDB Endowment", fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Liu:2025:DQQ, author = "Hanwen Liu and Federico Spedalieri and Ibrahim Sabek", title = "A Demonstration of {Q$^2$O}: Quantum-Augmented Query Optimizer", journal = j-PROC-VLDB-ENDOWMENT, volume = "18", number = "12", pages = "5439--5443", month = aug, year = "2025", CODEN = "????", DOI = "https://doi.org/10.14778/3750601.3750691", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Fri Oct 3 16:56:21 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The join order (JO) optimization problem is a key challenge in query optimization. Classical approaches can compute the optimal solution for smaller queries. For larger queries, some heuristic methods trade off plan quality to reduce the exponential \ldots{}", acknowledgement = ack-nhfb, ajournal = "Proc. 
VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Skavantzos:2025:WER,
  author =       "Philipp Skavantzos and Sebastian Link",
  title =        "When Entity\slash Relationship Models Meet Graph
                 Databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5444--5447",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750617",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This tutorial shows how traditional
                 Entity/Relationship modeling and modern graph data
                 modeling can be combined to bring forward
                 well-designed graph data models that process
                 workloads and maintain data integrity efficiently.",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Cormode:2025:STD,
  author =       "Graham Cormode and Shripad Gade and Samuel Maddock
                 and Enayat Ullah",
  title =        "Synthetic Tabular Data: Methods, Attacks and
                 Defenses",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5448--5450",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750692",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Synthetic data is often positioned as a solution to
                 replace sensitive fixed-size data sets with a source
                 of unlimited matching data, freed from privacy
                 concerns. There has been much progress in synthetic
                 data generation over the last decade, leveraging
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Hussein:2025:LLM,
  author =       "Youssef Hussein and Mohamed Hemdan and Mohamed F.
                 Mokbel",
  title =        "Large Language Models for Spatial Analysis Queries",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5451--5454",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750693",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This tutorial provides a comprehensive overview of
                 the research landscape of employing Large Language
                 Models (LLMs) to spatial analysis queries. The
                 tutorial categorizes the research in this area based
                 on how LLMs are employed to serve such queries. This
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Abedjan:2025:DDD,
  author =       "Ziawasch Abedjan and Mahdi Esmailoghli and Sainyam
                 Galhotra",
  title =        "Data Discovery in Data Lakes: Operations, Indexes,
                 Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5455--5459",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750694",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Data discovery has gained significant traction in
                 the database community resulting in various
                 discovery operations, index schemes, and discovery
                 systems. This tutorial explores the architecture and
                 components of data discovery systems, focusing on
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yan:2025:SSG,
  author =       "Da Yan and Lyuheng Yuan and Akhlaque Ahmad and
                 Saugat Adhikari",
  title =        "Systems for Scalable Graph Analytics and Machine
                 Learning: Trends and Methods",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5460--5465",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750695",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Graph-theoretic algorithms and graph machine
                 learning models are essential tools for addressing
                 many real-life problems, such as social network
                 analysis and bioinformatics. To support large-scale
                 graph analytics, graph-parallel systems have been
                 actively \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Luo:2025:NLS,
  author =       "Yuyu Luo and Guoliang Li and Ju Fan and Chengliang
                 Chai and Nan Tang",
  title =        "Natural Language to {SQL}: State of the Art and Open
                 Problems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5466--5471",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750696",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Translating users' natural language queries (nl)
                 into sql queries (i.e., nl2sql) can significantly
                 reduce barriers to accessing relational databases
                 and support various commercial applications. The
                 performance of nl2sql has been greatly improved with
                 the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Rico:2025:NTD,
  author =       "Ramon Rico and Arno Siebes and Yannis Velegrakis",
  title =        "New Trends in Data Forgetting for Sustainable Data
                 Management",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5472--5476",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750697",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Our ability to collect data is rapidly surpassing
                 our ability to store it. As a result, organizations
                 are faced with difficult decisions about what data
                 to retain, and in what form, in order to meet their
                 business goals while complying with storage
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Kondylakis:2025:PGS,
  author =       "Haridimos Kondylakis and Stefania Dumbrava and
                 Matteo Lissandrini and Nikolay Yakovets and Angela
                 Bonifati and Vasilis Efthymiou and George Fletcher
                 and Dimitris Plexousakis and Riccardo Tommasini and
                 Georgia Troullinou and Elisjana Ymeralli",
  title =        "Property Graph Standards: State of the Art \& Open
                 Challenges",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5477--5481",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750698",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Property Graphs are a versatile and expressive data
                 model that has gained widespread adoption due to
                 their flexibility in supporting labeled and
                 attributed nodes and edges. They are
                 well-established in research communities and are
                 becoming widespread in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Heinrich:2025:LCM,
  author =       "Roman Heinrich and Xiao Li and Manisha Luthra and
                 Zoi Kaoudi",
  title =        "Learned Cost Models for Query Optimization: From
                 Batch to Streaming Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5482--5487",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750699",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Learned cost models (LCMs) have recently gained
                 traction as a promising alternative to traditional
                 cost estimation techniques in data management,
                 offering improved accuracy by capturing complex
                 interactions between queries, data, and runtime
                 behavior. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Chronis:2025:FVS,
  author =       "Yannis Chronis and Helena Caminal and Yannis
                 Papakonstantinou and Fatma {\"O}zcan and Anastasia
                 Ailamaki",
  title =        "Filtered Vector Search: State-of-the-Art and
                 Research Opportunities",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5488--5492",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750700",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This tutorial provides a comprehensive overview of
                 filtered vector search (fvs). Fvs queries combine
                 vector search with relational operators. The
                 tutorial explores the challenges of integrating
                 vector search into database engines and emphasizes
                 the need \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2025:MAM,
  author =       "Mengying Wang and Moming Duan and Yicong Huang and
                 Chen Li and Bingsheng He and Yinghui Wu",
  title =        "{ML}-Asset Management: Curation, Discovery, and
                 Utilization",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5493--5498",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750701",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Machine learning (ML) assets, such as models,
                 datasets, and metadata---are central to modern ML
                 workflows. Despite their explosive growth in
                 practice, these assets are often underutilized due
                 to fragmented documentation, siloed storage,
                 inconsistent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Wang:2025:MLG,
  author =       "Hanchen Wang and Ying Zhang and Wenjie Zhang",
  title =        "Machine Learning for Graph Data Management and Query
                 Processing",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5499--5503",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750702",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Machine learning techniques have been proposed to
                 optimize the performance of graph databases in
                 recent years. Due to the NP-hardness of graph
                 database tasks and the complexity of graph data,
                 traditional exact solutions usually encounter
                 efficiency \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Pan:2025:DPL,
  author =       "James Pan and Guoliang Li",
  title =        "Database Perspective on {LLM} Inference Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5504--5507",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3750703",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Large language models (LLMs) are powering a new wave
                 of language-based applications, including database
                 applications, leading to new techniques and systems
                 for dealing with the enormous compute and memory
                 needs of LLMs, coupled with advances in computing
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% NOTE(review): pages 5508--5515 of v18n12 are not covered by any
%%% entry: Pan:2025:DPL ends at page 5507 and Bonifati:2025:VPG begins
%%% at page 5516.  Check the publisher site for missing articles.

@Article{Bonifati:2025:VPG,
  author =       "Angela Bonifati",
  title =        "Versatile Property Graph Transformations",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5516--5526",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760517",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Property graphs are key components of modern graph
                 database systems as well as graph analytical
                 systems. They support highly expressive data models
                 consisting of multi-labeled nodes and edges, along
                 with properties represented as key/value pairs.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Yu:2025:DNA,
  author =       "Xiangyao Yu",
  title =        "{Disaggregation}: a New Architecture for Cloud
                 Databases",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5527--5530",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760520",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Disaggregation---the separation of database
                 components into independently managed and scalable
                 services---has emerged as a foundational
                 architecture for cloud-native databases. It enables
                 key benefits such as elasticity, resource pooling,
                 and cost \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Leis:2025:SAH,
  author =       "Viktor Leis and Andrey Gubichev and Atanas Mirchev
                 and Peter Boncz and Alfons Kemper and Thomas
                 Neumann",
  title =        "Still Asking: How Good Are Query Optimizers,
                 Really?",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5531--5536",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760521",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "This retrospective revisits our 2015 PVLDB paper
                 ``How Good Are Query Optimizers, Really?'', which
                 challenged the prevailing notion that query
                 optimization was a solved problem. By designing the
                 Join Order Benchmark (JOB) and conducting a series
                 of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Idreos:2025:AGC,
  author =       "Stratos Idreos",
  title =        "Alphabets, Grammars, Calculators, and the End of
                 Hand-Crafted Systems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5537--5537",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760522",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "The AI revolution is transforming every scientific
                 field and business sector, driving an unprecedented
                 demand for data-centric computation. As new data
                 types, hardware platforms, and workloads appear
                 faster than ever before, the backbone systems that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Freire:2025:BDD,
  author =       "Juliana Freire",
  title =        "Bridging Disciplines in Data Management Research to
                 Solve Complex Data Problems",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5538--5538",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760523",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Scientific discovery has undergone profound
                 transformations across multiple paradigms, each
                 bringing new data challenges whose solutions demand
                 bridging multiple areas of computer science. This
                 talk presents a research journey spanning three
                 scientific \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

@Article{Zaharia:2025:BOA,
  author =       "Matei Zaharia",
  title =        "Bringing the Operational and Analytical Worlds
                 Together with {Lakebase}",
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       "18",
  number =       "12",
  pages =        "5539--5539",
  month =        aug,
  year =         "2025",
  CODEN =        "????",
  DOI =          "https://doi.org/10.14778/3750601.3760524",
  ISSN =         "2150-8097",
  ISSN-L =       "2150-8097",
  bibdate =      "Fri Oct 3 16:56:21 MDT 2025",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "As database workloads increasingly move into large
                 shared-nothing cloud datacenters, the bits storing
                 operational data, analytical tables, streams, etc
                 all sit together on the same disks in the cloud.
                 This creates new opportunities to unify the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "Proc. VLDB Endowment",
  fjournal =     "Proceedings of the VLDB Endowment",
  journal-URL =  "https://dl.acm.org/loi/pvldb",
}

%%% [03-Oct-2025] TO DO: v13n11 (July 2020) is STILL not yet published
%%% [03-Oct-2025] TO DO: v18n12 may still be in progress

%%% WARNING: Check page gaps for missing entries at end of each issue:
%%% many Web pages hide blocks of articles, and require manual checking
%%% of boxes to make them visible!