% Ntoulas, Najork, Manasse, Fetterly: content-based web spam detection, WWW 2006 (Edinburgh).
% The doi field holds the bare DOI (no resolver prefix); percent signs in the abstract are escaped for TeX.
@inproceedings{ntoulas2006spam,
  author      = {Ntoulas, Alexandros and Najork, Marc and Manasse, Mark and Fetterly, Dennis},
  title       = {Detecting spam web pages through content analysis},
  booktitle   = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  year        = {2006},
  pages       = {83--92},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  location    = {Edinburgh, Scotland},
  isbn        = {1-59593-323-9},
  doi         = {10.1145/1135777.1135794},
  url         = {http://portal.acm.org/citation.cfm?id=1135794},
  keywords    = {features spam web},
  abstract    = {In this paper, we continue our investigations of ``web spam'': the injection of artificially-created pages into the web in order to influence the results from search engines, to drive traffic to certain pages for fun or profit. This paper considers some previously-undescribed techniques for automatically detecting spam pages, examines the effectiveness of these techniques in isolation and when aggregated using classification algorithms. When combined, our heuristics correctly identify 2,037 (86.2\%) of the 2,364 spam pages (13.8\%) in our judged collection of 17,168 pages, while misidentifying 526 spam and non-spam pages (3.1\%).},
  description = {Detecting spam web pages through content analysis},
  biburl      = {http://www.bibsonomy.org/bibtex/2c93f4228fd8552bede071569cdaa1ad9/beate},
}