% ArchiveSpark (JCDL 2016), dblp record.
% Describes the same paper as the ACM record further down (identical interhash);
% it is kept under its own key so that citations of either key keep resolving.
@inproceedings{conf/jcdl/HolzmannGA16,
  added-at  = {2021-08-11T00:00:00.000+0200},
  author    = {Holzmann, Helge and Goel, Vinay and Anand, Avishek},
  biburl    = {https://www.bibsonomy.org/bibtex/25f991141cbf714d2d9d863618ed71a01/dblp},
  booktitle = {JCDL},
  crossref  = {conf/jcdl/2016},
  editor    = {Adam, Nabil R. and Cassel, Lillian (Boots) and Yesha, Yelena and Furuta, Richard and Weigle, Michele C.},
  ee        = {https://ieeexplore.ieee.org/document/7559568/},
  interhash = {d47ccaf56b0796fef5598c8cbdd386bc},
  intrahash = {5f991141cbf714d2d9d863618ed71a01},
  isbn      = {978-1-4503-4229-2},
  keywords  = {dblp},
  pages     = {83--92},
  publisher = {ACM},
  timestamp = {2024-04-09T12:35:47.000+0200},
  title     = {{ArchiveSpark}: Efficient Web Archive Access, Extraction and Derivation},
  url       = {http://dblp.uni-trier.de/db/conf/jcdl/jcdl2016.html#HolzmannGA16},
  year      = {2016},
}

% ArchiveSpark (JCDL 2016), ACM record.
% This key was previously defined twice (two BibSonomy posts of the same record,
% with biburl paths ending in "alexandriaproj" and "helgeho"), which BibTeX
% rejects as a repeated entry. The two posts are merged here: the bibliographic
% fields were identical, keywords are the union of both posts, and "description"
% comes from the second post. added-at, biburl, timestamp and url are those of
% the first post. The second post's values, for reference:
%   added-at  2016-09-10T10:01:07.000+0200
%   biburl    https://www.bibsonomy.org/bibtex/230ff9fb37535ccb8a62b9afbcf74432e/helgeho
%   timestamp 2016-09-10T10:43:05.000+0200
%   url       http://doi.acm.org/10.1145/2910896.2910902  (DOI resolver link; the DOI itself is in the doi field)
@inproceedings{Holzmann:2016:AEW:2910896.2910902,
  abstract    = {Web archives are a valuable resource for researchers of various disciplines. However, to use them as a scholarly source, researchers require a tool that provides efficient access to Web archive data for extraction and derivation of smaller datasets. Besides efficient access we identify five other objectives based on practical researcher needs such as ease of use, extensibility and reusability. Towards these objectives we propose ArchiveSpark, a framework for efficient, distributed Web archive processing that builds a research corpus by working on existing and standardized data formats commonly held by Web archiving institutions. Performance optimizations in ArchiveSpark, facilitated by the use of a widely available metadata index, result in significant speed-ups of data processing. Our benchmarks show that ArchiveSpark is faster than alternative approaches without depending on any additional data stores while improving usability by seamlessly integrating queries and derivations with external tools.},
  acmid       = {2910902},
  added-at    = {2016-09-22T14:50:40.000+0200},
  address     = {New York, NY, USA},
  author      = {Holzmann, Helge and Goel, Vinay and Anand, Avishek},
  biburl      = {https://www.bibsonomy.org/bibtex/230ff9fb37535ccb8a62b9afbcf74432e/alexandriaproj},
  booktitle   = {Proceedings of the 16th ACM/IEEE-CS on Joint Conference on Digital Libraries},
  description = {ArchiveSpark},
  doi         = {10.1145/2910896.2910902},
  interhash   = {d47ccaf56b0796fef5598c8cbdd386bc},
  intrahash   = {30ff9fb37535ccb8a62b9afbcf74432e},
  isbn        = {978-1-4503-4229-2},
  keywords    = {alexandria myown sysrelevantforl3s webarchiving},
  location    = {Newark, New Jersey, USA},
  numpages    = {10},
  pages       = {83--92},
  publisher   = {ACM},
  series      = {JCDL '16},
  timestamp   = {2017-02-07T15:20:44.000+0100},
  title       = {{ArchiveSpark}: Efficient Web Archive Access, Extraction and Derivation},
  url         = {https://arxiv.org/abs/1702.01015},
  year        = {2016},
}