% Bibliography on duplicate and near-duplicate detection (BibSonomy export).
%
% Conventions used in this file:
%   - one entry per citation key; keys are unique
%   - names are stored in "Last, First" form
%   - page and issue ranges use a double hyphen
%   - doi holds the bare DOI without a resolver prefix
%   - biburl, biburl2, biburl3, ee, timestamp, description, id and priority
%     are bookkeeping fields that standard styles ignore
%
% Maintenance notes:
%   - Bharat-Mirror-1999 was exported three times by three BibSonomy users;
%     the copies are merged here, with the extra post URLs kept in biburl2
%     and biburl3 and the keyword lists unioned.
%   - bilenko03duplicate, bilenko03 and citeulike:248914 all describe the
%     same KDD-2003 paper. All three keys are retained so that existing
%     citations keep resolving; prefer bilenko03duplicate in new documents.
%   - The key bilenko03 used to be attached to two different papers. The
%     workshop paper now lives under bilenko03evaluation.
%   - The DBLP record dates are stored in timestamp rather than date, since
%     a date field would override year under biblatex.

@article{Bharat-Mirror-1999,
  author   = {Bharat, Krishna and Broder, Andrei Z.},
  title    = {Mirror, Mirror on the {Web}: A Study of Host Pairs with Replicated Content},
  journal  = {Computer Networks},
  volume   = {31},
  number   = {11--16},
  pages    = {1579--1590},
  year     = {1999},
  url      = {http://dblp.uni-trier.de/db/journals/cn/cn31.html#BharatB99},
  biburl   = {http://www.bibsonomy.org/bibtex/2a6610a9148781299cbb76396c0c520bb/olhah},
  biburl2  = {http://www.bibsonomy.org/bibtex/2a6610a9148781299cbb76396c0c520bb/robo},
  biburl3  = {http://www.bibsonomy.org/bibtex/2a6610a9148781299cbb76396c0c520bb/juver},
  keywords = {content duplicate host information_science pairs replicated web wismasys0809},
}

@inproceedings{Padmasree06,
  author    = {Padmasree, Lam and Ambati, Vamshi and Chandulal, Jasthi Anand and Rao, Meda Sreenivasa},
  title     = {Signature Based Duplicate Detection in Digital Libraries},
  booktitle = {Proceedings of the 2nd {ICUDL}},
  editor    = {Serageldin, Ismail and Reddy, Raj},
  address   = {Alexandria},
  year      = {2006},
  url       = {http://www.ulib.org/conference/2006/25.pdf},
  biburl    = {http://www.bibsonomy.org/bibtex/26369260b8ed58d9445b8d2df0a1864f4/nichtich},
  keywords  = {duplicate library bibkey signature},
}

@inproceedings{newman2004crr,
  author    = {Newman, E. and Doran, W. and Stokes, N. and Carthy, J. and Dunnion, J.},
  title     = {Comparing Redundancy Removal Techniques for Multi-Document Summarisation},
  booktitle = {{STAIRS} 2004: Proceedings of the Second Starting {AI} Researchers' Symposium},
  publisher = {IOS Press},
  year      = {2004},
  biburl    = {http://www.bibsonomy.org/bibtex/2eda54bb84022c47e3e25234a2ef3221d/renew},
  keywords  = {removal detection duplicate summarization sentence redundancy},
}

@inproceedings{bilenko03duplicate,
  author    = {Bilenko, Mikhail and Mooney, Raymond J.},
  title     = {Adaptive Duplicate Detection Using Learnable String Similarity Measures},
  booktitle = {Proceedings of the 9th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining ({KDD-2003})},
  pages     = {39--48},
  year      = {2003},
  biburl    = {http://www.bibsonomy.org/bibtex/23e1cf1edef2e9e9118d1d120183c34f9/philipp},
  keywords  = {duplicate similarity svn},
}

@inproceedings{1060753,
  author      = {Metwally, Ahmed and Agrawal, Divyakant and El Abbadi, Amr},
  title       = {Duplicate Detection in Click Streams},
  booktitle   = {{WWW} '05: Proceedings of the 14th International Conference on {World Wide Web}},
  pages       = {12--21},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  location    = {Chiba, Japan},
  year        = {2005},
  isbn        = {1-59593-046-9},
  doi         = {10.1145/1060745.1060753},
  url         = {http://portal.acm.org/citation.cfm?id=1060745.1060753},
  description = {Duplicate detection in click streams},
  abstract    = {We consider the problem of finding duplicates in data streams. Duplicate detection in data streams is utilized in various applications including fraud detection. We develop a solution based on Bloom Filters [9], and discuss the space and time requirements for running the proposed algorithm in both the contexts of sliding, and landmark stream windows. We run a comprehensive set of experiments, using both real and synthetic click streams, to evaluate the performance of the proposed solution. The results demonstrate that the proposed solution yields extremely low error rates.},
  biburl      = {http://www.bibsonomy.org/bibtex/2f6b44ae67e9d3960e3a1bb7fe630ea5f/jhammerb},
  keywords    = {duplicate datamining bloomfilter algorithms data_structures},
}

@article{journals/tois/ChowdhuryFGM02,
  author      = {Chowdhury, Abdur and Frieder, Ophir and Grossman, David A. and McCabe, M. Catherine},
  title       = {Collection Statistics for Fast Duplicate Document Detection},
  journal     = {ACM Transactions on Information Systems},
  volume      = {20},
  number      = {2},
  pages       = {171--191},
  year        = {2002},
  doi         = {10.1145/506309.506311},
  url         = {http://dblp.uni-trier.de/db/journals/tois/tois20.html#ChowdhuryFGM02},
  ee          = {http://doi.acm.org/10.1145/506309.506311},
  timestamp   = {2003-11-25},
  description = {dblp},
  biburl      = {http://www.bibsonomy.org/bibtex/224249e2a7b8b809050f9083fc75d3c18/hotho},
  keywords    = {document detection duplicate toread},
}

@inproceedings{citeulike:248914,
  author    = {Bilenko, Mikhail and Mooney, Raymond J.},
  title     = {Adaptive Duplicate Detection Using Learnable String Similarity Measures},
  booktitle = {Proceedings of the 9th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining ({KDD-2003})},
  pages     = {39--48},
  year      = {2003},
  url       = {http://citeseer.ist.psu.edu/bilenko03adaptive.html},
  id        = {248914},
  priority  = {0},
  abstract  = {The problem of identifying approximately duplicate records in databases is an essential step for data cleaning and data integration processes. Most existing approaches have relied on generic or manually tuned distance metrics for estimating the similarity of potential duplicates. In this paper, we present a framework for improving duplicate detection using trainable measures of textual similarity. We propose to employ learnable text distance functions for each database field, and show that such ...},
  biburl    = {http://www.bibsonomy.org/bibtex/2af65d4136267ce85a045c00122da3667/wnpxrz},
  keywords  = {detection measure duplicate string similarity},
}

@article{journals/cn/BroderGMZ97,
  author      = {Broder, Andrei Z. and Glassman, Steven C. and Manasse, Mark S. and Zweig, Geoffrey},
  title       = {Syntactic Clustering of the {Web}},
  journal     = {Computer Networks},
  volume      = {29},
  number      = {8--13},
  pages       = {1157--1166},
  year        = {1997},
  doi         = {10.1016/S0169-7552(97)00031-7},
  url         = {http://dblp.uni-trier.de/db/journals/cn/cn29.html#BroderGMZ97},
  ee          = {http://dx.doi.org/10.1016/S0169-7552(97)00031-7},
  timestamp   = {2003-11-27},
  description = {dblp},
  biburl      = {http://www.bibsonomy.org/bibtex/2b88a36c088beef971845324c862599d0/hotho},
  keywords    = {detection duplicate toread},
}

@inproceedings{bilenko03,
  author    = {Bilenko, Mikhail and Mooney, Raymond J.},
  title     = {Adaptive Duplicate Detection Using Learnable String Similarity Measures},
  booktitle = {Proceedings of the 9th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining ({KDD-2003})},
  pages     = {39--48},
  address   = {Washington, DC},
  location  = {Washington, DC},
  year      = {2003},
  biburl    = {http://www.bibsonomy.org/bibtex/2583fc29d1c996be432f57f404f1963f8/sam_chapman},
  keywords  = {detection duplicate string similarity learning machine},
}

% Formerly a second entry under the key bilenko03. BibTeX keeps only the first
% entry for a repeated key, so this paper was previously unreachable.
% NOTE(review): the export gave location as Seattle, WA while address said
% Washington, DC; the address value is used for both. The export also cut the
% booktitle off after "Workshop on Data"; confirm the completed title below
% against the workshop proceedings.
@inproceedings{bilenko03evaluation,
  author    = {Bilenko, Mikhail and Mooney, Raymond J.},
  title     = {On Evaluation and Training-Set Construction for Duplicate Detection},
  booktitle = {Proceedings of the {KDD-2003} Workshop on Data Cleaning, Record Linkage, and Object Consolidation},
  pages     = {7--12},
  address   = {Washington, DC},
  location  = {Washington, DC},
  year      = {2003},
  biburl    = {http://www.bibsonomy.org/bibtex/29e3213fa6a72427ed9ee6c3a6933f406/sam_chapman},
  keywords  = {training detection standard duplicate set RIDDLE gold construction},
}