We describe a system that incrementally translates SPARQL queries to Pig Latin and executes them on a Hadoop
cluster. This system is designed to work efficiently on complex queries with many self-joins over huge datasets, avoiding
job failures even in the case of joins with unexpected high-value skew. To be robust against cost estimation errors, our
system interleaves query optimization with query execution, determining the next steps to take based on data samples and
statistics gathered during the previous step. Furthermore, we have developed a novel skew-resistant join algorithm that
replicates tuples corresponding to popular keys. We evaluate the effectiveness of our approach both on a synthetic benchmark
known to generate complex queries (BSBM-BI) as well as on a Yahoo! case of data analysis using RDF data crawled from the
web. Our results indicate that our system is indeed capable of processing huge datasets without precomputed statistics while
exhibiting good load-balancing properties.
%0 Conference Paper
%1 20299
%A Kotoulas, S
%A Urbani, J
%A Boncz, P. A.
%A Mika, P
%B Proceedings of International Semantic Web Conference 2012
%D 2012
%I Springer
%K ldbc-related lod2page myown sysrelevantforlod2
%T Robust Runtime Optimization And Skew-Resistant Execution Of Analytical SPARQL Queries On Pig
%U http://oai.cwi.nl/oai/asset/20299/20299B.pdf
%X We describe a system that incrementally translates SPARQL queries to Pig Latin and executes them on a Hadoop
cluster. This system is designed to work efficiently on complex queries with many self-joins over huge datasets, avoiding
job failures even in the case of joins with unexpected high-value skew. To be robust against cost estimation errors, our
system interleaves query optimization with query execution, determining the next steps to take based on data samples and
statistics gathered during the previous step. Furthermore, we have developed a novel skew-resistant join algorithm that
replicates tuples corresponding to popular keys. We evaluate the effectiveness of our approach both on a synthetic benchmark
known to generate complex queries (BSBM-BI) as well as on a Yahoo! case of data analysis using RDF data crawled from the
web. Our results indicate that our system is indeed capable of processing huge datasets without precomputed statistics while
exhibiting good load-balancing properties.
@inproceedings{20299,
  author             = {Kotoulas, S. and Urbani, J. and Boncz, P. A. and Mika, P.},
  title              = {Robust Runtime Optimization and Skew-Resistant Execution of Analytical {SPARQL} Queries on {Pig}},
  booktitle          = {Proceedings of International Semantic Web Conference 2012},
  publisher          = {Springer},
  year               = 2012,
  month              = nov,
  url                = {http://oai.cwi.nl/oai/asset/20299/20299B.pdf},
  abstract           = {We describe a system that incrementally translates SPARQL queries to Pig Latin and executes them on a Hadoop
cluster. This system is designed to work efficiently on complex queries with many self-joins over huge datasets, avoiding
job failures even in the case of joins with unexpected high-value skew. To be robust against cost estimation errors, our
system interleaves query optimization with query execution, determining the next steps to take based on data samples and
statistics gathered during the previous step. Furthermore, we have developed a novel skew-resistant join algorithm that
replicates tuples corresponding to popular keys. We evaluate the effectiveness of our approach both on a synthetic benchmark
known to generate complex queries (BSBM-BI) as well as on a Yahoo! case of data analysis using RDF data crawled from the
web. Our results indicate that our system is indeed capable of processing huge datasets without precomputed statistics while
exhibiting good load-balancing properties.},
  keywords           = {ldbc-related lod2page myown sysrelevantforlod2},
  language           = {en},
  conferencetitle    = {International Semantic Web Conference},
  conferencedate     = {2012},
  conferencelocation = {Boston, USA},
  group              = {INS1},
  refereed           = {y},
  added-at           = {2012-08-20T16:16:29.000+0200},
  timestamp          = {2013-09-13T09:41:37.000+0200},
  biburl             = {https://www.bibsonomy.org/bibtex/275f9013e3b25cb9afb9bd4849a0c71f4/peterboncz},
  interhash          = {59cb0e47ece7ed628090c4a9c57cce90},
  intrahash          = {75f9013e3b25cb9afb9bd4849a0c71f4}
}