% Three records of the same paper (identical interhash; each biburl points at
% a different BibSonomy user). All three citation keys are kept so existing
% citations keep resolving.
%
% NOTE(review): the keys Broder1997 and broder1997 differ only by letter case.
% Classic BibTeX folds case when comparing keys and may report the second one
% as a repeated entry; Biber treats them as distinct. Rename one of them only
% after updating the documents that cite it.

@inproceedings{Broder1997,
  abstract             = {Given two documents A and B we define two mathematical
                          notions: their resemblance r(A, B) and their
                          containment c(A, B) that seem to capture well the
                          informal notions of ``roughly the same'' and
                          ``roughly contained.'' The basic idea is to reduce
                          these issues to set intersection problems that can be
                          easily evaluated by a process of random sampling that
                          can be done independently for each document.
                          Furthermore, the resemblance can be evaluated using a
                          fixed size sample for each document. This paper
                          discusses the mathematical properties of these
                          measures and the efficient implementation of the
                          sampling process using Rabin (1981) fingerprints},
  added-at             = {2011-07-07T11:07:43.000+0200},
  address              = {Salerno, Italy},
  author               = {Broder, Andrei Z.},
  biburl               = {http://www.bibsonomy.org/bibtex/2e8d7e47dafc145c54846bb69e1c1be39/stroeh},
  booktitle            = {Compression and Complexity of Sequences},
  citeulike-article-id = {562668},
  description          = {Not previously uploaded},
  interhash            = {3e9b05638c537f23a276ef4e09d4b9d4},
  intrahash            = {e8d7e47dafc145c54846bb69e1c1be39},
  keywords             = {detection duplicate resemblance},
  month                = jun,
  pages                = {21--29},
  priority             = {3},
  publisher            = {IEEE Computer Society Press},
  timestamp            = {2011-07-07T11:07:43.000+0200},
  title                = {On the resemblance and containment of documents},
  url                  = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.779&rep=rep1&type=pdf},
  year                 = 1997,
}

@inproceedings{broder1997,
  added-at    = {2009-08-19T01:22:38.000+0200},
  address     = {Washington, DC, USA},
  author      = {Broder, Andrei Z.},
  biburl      = {http://www.bibsonomy.org/bibtex/278b3f3faced79adfcda4e3a57f7e57ff/mstrohm},
  booktitle   = {SEQUENCES '97: Proceedings of the Compression and Complexity of Sequences 1997},
  description = {on shingles},
  interhash   = {3e9b05638c537f23a276ef4e09d4b9d4},
  intrahash   = {78b3f3faced79adfcda4e3a57f7e57ff},
  keywords    = {INFLUENTIAL information-retrieval similarity},
  pages       = {21--29},
  publisher   = {IEEE Computer Society},
  timestamp   = {2009-08-19T01:22:38.000+0200},
  title       = {On the Resemblance and Containment of Documents},
  year        = 1997,
}

@inproceedings{broder97,
  abstract             = {Given two documents A and B we define two mathematical
                          notions: their resemblance r(A, B) and their
                          containment c(A, B) that seem to capture well the
                          informal notions of ``roughly the same'' and
                          ``roughly contained.'' The basic idea is to reduce
                          these issues to set intersection problems that can be
                          easily evaluated by a process of random sampling that
                          can be done independently for each document.
                          Furthermore, the resemblance can be evaluated using a
                          fixed size sample for each document. This paper
                          discusses the mathematical properties of these
                          measures and the efficient implementation of the
                          sampling process using Rabin (1981) fingerprints},
  added-at             = {2006-09-25T06:32:37.000+0200},
  address              = {Salerno, Italy},
  author               = {Broder, Andrei Z.},
  biburl               = {http://www.bibsonomy.org/bibtex/22948189a910501dfdb86469a3e13505a/neilernst},
  booktitle            = {Compression and Complexity of Sequences},
  citeulike-article-id = {562668},
  description          = {Not previously uploaded},
  interhash            = {3e9b05638c537f23a276ef4e09d4b9d4},
  intrahash            = {2948189a910501dfdb86469a3e13505a},
  keywords             = {shingles database},
  month                = jun,
  pages                = {21--29},
  priority             = {3},
  publisher            = {IEEE Computer Society Press},
  timestamp            = {2006-09-25T06:32:37.000+0200},
  title                = {On the resemblance and containment of documents},
  url                  = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=666900},
  year                 = 1997,
}