This paper focuses on spam blog (splog) detection. Blogs are
highly popular, new media social communication mechanisms.
The presence of splogs degrades blog search results as well as
wastes network resources. In our approach we exploit unique blog
temporal dynamics to detect splogs.
There are three key ideas in our splog detection framework. We
first represent the blog temporal dynamics using self-similarity
matrices defined on the histogram intersection similarity measure
of the time, content, and link attributes of posts. Second, we show
via a novel visualization that the blog temporal characteristics
reveal attribute correlation, depending on type of the blog (normal
blogs and splogs). Third, we propose the use of temporal
structural properties computed from self-similarity matrices across
different attributes. In a splog detector, these novel features are
combined with content based features. We extract a content based
feature vector from different parts of the blog – URLs, post
content, etc. The dimensionality of the feature vector is reduced
by Fisher linear discriminant analysis. We have tested an SVM
based splog detector using proposed features on real world
datasets, with excellent results (90\% accuracy).
%0 Conference Paper
%1 lin_2007_splogtime
%A Lin, Yu-Ru
%A Sundaram, Hari
%A Chi, Yun
%A Tatemura, Junichi
%A Tseng, Belle L.
%B AIRWeb '07: Proceedings of the 3rd international workshop on Adversarial information retrieval on the web
%C New York, NY, USA
%D 2007
%I ACM Press
%K adversarial-ir, blogs, time
%P 1--8
%R 10.1145/1244408.1244410
%T Splog detection using self-similarity analysis on blog temporal dynamics
%U http://dx.doi.org/10.1145/1244408.1244410
%X This paper focuses on spam blog (splog) detection. Blogs are
highly popular, new media social communication mechanisms.
The presence of splogs degrades blog search results as well as
wastes network resources. In our approach we exploit unique blog
temporal dynamics to detect splogs.
There are three key ideas in our splog detection framework. We
first represent the blog temporal dynamics using self-similarity
matrices defined on the histogram intersection similarity measure
of the time, content, and link attributes of posts. Second, we show
via a novel visualization that the blog temporal characteristics
reveal attribute correlation, depending on type of the blog (normal
blogs and splogs). Third, we propose the use of temporal
structural properties computed from self-similarity matrices across
different attributes. In a splog detector, these novel features are
combined with content based features. We extract a content based
feature vector from different parts of the blog – URLs, post
content, etc. The dimensionality of the feature vector is reduced
by Fisher linear discriminant analysis. We have tested an SVM
based splog detector using proposed features on real world
datasets, with excellent results (90\% accuracy).
%@ 9781595937322
% Lin, Sundaram, Chi, Tatemura and Tseng: workshop paper (AIRWeb '07) on
% detecting spam blogs (splogs) from self-similarity of blog temporal dynamics.
% The abstract uses "--" instead of a literal en dash so the entry stays plain
% ASCII for classic 8-bit BibTeX. The fields added-at, biburl, citeulike-*,
% interhash, intrahash, posted-at, priority and timestamp are non-standard
% bookkeeping fields from the exporting services; they are kept verbatim.
@inproceedings{lin_2007_splogtime,
  abstract             = {This paper focuses on spam blog (splog) detection. Blogs are
    highly popular, new media social communication mechanisms.
    The presence of splogs degrades blog search results as well as
    wastes network resources. In our approach we exploit unique blog
    temporal dynamics to detect splogs.
    There are three key ideas in our splog detection framework. We
    first represent the blog temporal dynamics using self-similarity
    matrices defined on the histogram intersection similarity measure
    of the time, content, and link attributes of posts. Second, we show
    via a novel visualization that the blog temporal characteristics
    reveal attribute correlation, depending on type of the blog (normal
    blogs and splogs). Third, we propose the use of temporal
    structural properties computed from self-similarity matrices across
    different attributes. In a splog detector, these novel features are
    combined with content based features. We extract a content based
    feature vector from different parts of the blog -- URLs, post
    content, etc. The dimensionality of the feature vector is reduced
    by Fisher linear discriminant analysis. We have tested an SVM
    based splog detector using proposed features on real world
    datasets, with excellent results (90\% accuracy).},
  added-at             = {2009-08-06T15:16:38.000+0200},
  address              = {New York, NY, USA},
  author               = {Lin, Yu-Ru and Sundaram, Hari and Chi, Yun and Tatemura, Junichi and Tseng, Belle L.},
  biburl               = {https://www.bibsonomy.org/bibtex/218bcefb89ec656b7cfe37c9f9edf51dc/chato},
  booktitle            = {{AIRWeb} '07: Proceedings of the 3rd International Workshop on Adversarial Information Retrieval on the Web},
  citeulike-article-id = {1459446},
  citeulike-linkout-0  = {http://portal.acm.org/citation.cfm?id=1244410},
  citeulike-linkout-1  = {http://dx.doi.org/10.1145/1244408.1244410},
  doi                  = {10.1145/1244408.1244410},
  interhash            = {9a463e2b21e2c305db7df9f72aa78201},
  intrahash            = {18bcefb89ec656b7cfe37c9f9edf51dc},
  isbn                 = {9781595937322},
  keywords             = {adversarial-ir, blogs, time},
  pages                = {1--8},
  posted-at            = {2009-01-07 10:42:06},
  priority             = {0},
  publisher            = {ACM Press},
  timestamp            = {2009-08-06T15:16:43.000+0200},
  title                = {Splog detection using self-similarity analysis on blog temporal dynamics},
  url                  = {https://doi.org/10.1145/1244408.1244410},
  year                 = {2007},
}