| Authors: |
Carlos Castillo
and Debora Donato
and Aristides Gionis
and Vanessa Murdock
and Fabrizio Silvestri
|
| Editors: |
Wessel Kraaij
and Arjen P. de Vries
and Charles L. A. Clarke
and Norbert Fuhr
and Noriko Kando
|
| URL: |
http://www.dcc.uchile.cl/~ccastill/papers/cdgms_2006_know_your_neighbors.pdf |
| Tags: |
detection
neighbors
spam
spamdetection
web
|
| Abstract: |
Web spam can significantly deteriorate the quality of search
engine results. Thus there is a large incentive for commercial
search engines to detect spam pages efficiently and accurately.
In this paper we present a spam detection system
that uses the topology of the Web graph by exploiting the
link dependencies among the Web pages, and the content
of the pages themselves. We find that linked hosts tend to
belong to the same class: either both are spam or both are
non-spam. We demonstrate three methods of incorporating
the Web graph topology into the predictions obtained by
our base classifier: (i) clustering the host graph, and assigning
the label of all hosts in the cluster by majority vote, (ii)
propagating the predicted labels to neighboring hosts, and
(iii) using the predicted labels of neighboring hosts as new
features and retraining the classifier. The result is an accurate
system for detecting Web spam that can be applied in
practice to large-scale Web data.
Categories and Subject Descriptors: H.4.m [Information
Systems Applications]: Miscellaneous
General Terms: Algorithms, Measurement.
Keywords: Link spam, Content spam, Web spam |
% Conference paper. Fields not given here are inherited from the crossref
% target conf/sigir/2007 (classic BibTeX needs that parent entry to appear
% later in the file than this one).
@inproceedings{CastilloDGMS07,
  author    = {Castillo, Carlos and Donato, Debora and Gionis, Aristides and Murdock, Vanessa and Silvestri, Fabrizio},
  title     = {Know Your Neighbors: {Web} Spam Detection Using the {Web} Topology},
  booktitle = {SIGIR},
  editor    = {Kraaij, Wessel and de Vries, Arjen P. and Clarke, Charles L. A. and Fuhr, Norbert and Kando, Noriko},
  pages     = {423--430},
  publisher = {ACM},
  year      = {2007},
  date      = {2007-08-24},
  crossref  = {conf/sigir/2007},
  doi       = {10.1145/1277741.1277814},
  url       = {http://www.dcc.uchile.cl/~ccastill/papers/cdgms_2006_know_your_neighbors.pdf},
  isbn      = {978-1-59593-597-7},
  keywords  = {detection neighbors spam spamdetection web},
  abstract  = {Web spam can significantly deteriorate the quality of search
engine results. Thus there is a large incentive for commercial
search engines to detect spam pages efficiently and accurately.
In this paper we present a spam detection system
that uses the topology of the Web graph by exploiting the
link dependencies among the Web pages, and the content
of the pages themselves. We find that linked hosts tend to
belong to the same class: either both are spam or both are
non-spam. We demonstrate three methods of incorporating
the Web graph topology into the predictions obtained by
our base classifier: (i) clustering the host graph, and assigning
the label of all hosts in the cluster by majority vote, (ii)
propagating the predicted labels to neighboring hosts, and
(iii) using the predicted labels of neighboring hosts as new
features and retraining the classifier. The result is an accurate
system for detecting Web spam that can be applied in
practice to large-scale Web data.
Categories and Subject Descriptors: H.4.m [Information
Systems Applications]: Miscellaneous
General Terms: Algorithms, Measurement.
Keywords: Link spam, Content spam, Web spam},
}