% Journal name shared by the two entries below. The full name is stored once
% so both records render the venue identically; the acronym is brace-protected
% so that case-changing styles leave it alone.
@string{bulltcdl = {Bulletin of {IEEE} Technical Committee on Digital Libraries}}

% NOTE: the two entries below carry the same interhash and describe the same
% paper. The first appears to be the dblp import, the second the author's own
% BibSonomy post (it additionally carries the abstract). Both citation keys
% are kept so that existing citations keep resolving; cite only one of them
% within a single document to avoid a duplicated bibliography item.

% dblp record. The trailing period was removed from the title (punctuation is
% the style's job) and the words iCrawl and Web are brace-protected so that
% sentence-casing styles do not lowercase them.
@article{journals/tcdl/GossenDR15,
  author    = {Gossen, Gerhard and Demidova, Elena and Risse, Thomas},
  title     = {The {iCrawl} System for Focused and Integrated {Web} Archive Crawling},
  journal   = bulltcdl,
  volume    = {11},
  number    = {2},
  year      = {2015},
  url       = {http://dblp.uni-trier.de/db/journals/tcdl/tcdl11.html#GossenDR15},
  ee        = {https://bulletin.jcdl.org/Bulletin/v11n2/papers/gossen.pdf},
  keywords  = {dblp},
  added-at  = {2021-02-17T00:00:00.000+0100},
  timestamp = {2024-04-09T00:39:41.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/20f871178e3e44088f9bfaad38dad7c9d/dblp},
  interhash = {febedff82a8ea1d6bdac7090a1be0c5c},
  intrahash = {0f871178e3e44088f9bfaad38dad7c9d},
}

% Author-maintained record of the same paper, including the abstract. The
% abstract text is unchanged and only re-wrapped across lines.
@article{gossen2015icrawl,
  author    = {Gossen, Gerhard and Demidova, Elena and Risse, Thomas},
  title     = {The {iCrawl} System for Focused and Integrated {Web} Archive Crawling},
  journal   = bulltcdl,
  volume    = {11},
  number    = {2},
  year      = {2015},
  url       = {http://www.ieee-tcdl.org/Bulletin/v11n2/papers/gossen.pdf},
  abstract  = {The large size of the Web makes it infeasible for many
               institutions to collect, store and process archives of the
               entire Web. Instead, many institutions focus on creating
               archives of specific subsets of the Web. These subsets may be
               based around specific topics or events. Our iCrawl system
               provides a focused crawler that is able to automatically
               collect Web pages relevant to a topic based on content
               similarity. Recently, the archiving of Social Media platforms
               like Twitter has become relevant. Our system can conduct
               integrated crawls that collect Web pages and Social Media
               posts concurrently. During such a crawl newly discovered URLs
               are exchanged between the crawling subsystems. We built the
               system with the goal to enable domain experts to create
               archives for their topics of interest. Therefore the system is
               highly automated and provides support for specifying and
               conducting crawls. We will demonstrate an easy to use
               interface for crawl specification that allows users to find
               seed URLs as well as descriptive keywords using Web and Social
               Media search APIs.
               The iCrawl system is available as Open Source software.},
  keywords  = {alexandria crawler icrawl myown semantic webarchiving},
  added-at  = {2016-02-26T10:30:08.000+0100},
  timestamp = {2016-06-06T18:11:36.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/21fe7cddbda5ab187467e7a0f1c7a3ad2/trisse69},
  interhash = {febedff82a8ea1d6bdac7090a1be0c5c},
  intrahash = {1fe7cddbda5ab187467e7a0f1c7a3ad2},
}