We present an approach to training a binary logistic regression classifier in
the setting where the training data needs to be kept private. We provide a
theoretical analysis of the security of this procedure and experimental results
for the problem of large scale spam detection. High performance spam filters
often use character n-grams as features which result in large sparse vectors to
which applying our protocol directly is not feasible. We explore various
dimensionality reduction and parallelization approaches and provide a detailed
analysis of the speed and accuracy trade-off. Our results show that we can
achieve the accuracy of state of the art spam filters at comparable training
and testing time of non-private version of logistic regression.
%0 Generic
%1 pathak2011privacy
%A Pathak, Manas A.
%A Sharifi, Mehrbod
%A Raj, Bhiksha
%D 2011
%K info2.0 privacy spam-detection
%T Privacy Preserving Spam Filtering
%U http://arxiv.org/abs/1102.4021
%X We present an approach to training a binary logistic regression classifier in
the setting where the training data needs to be kept private. We provide a
theoretical analysis of the security of this procedure and experimental results
for the problem of large scale spam detection. High performance spam filters
often use character n-grams as features which result in large sparse vectors to
which applying our protocol directly is not feasible. We explore various
dimensionality reduction and parallelization approaches and provide a detailed
analysis of the speed and accuracy trade-off. Our results show that we can
achieve the accuracy of state of the art spam filters at comparable training
and testing time of non-private version of logistic regression.
@comment{arXiv preprint: the identifier is stored in eprint/archiveprefix (not in note) so styles can format and link it.}
@misc{pathak2011privacy,
  author        = {Pathak, Manas A. and Sharifi, Mehrbod and Raj, Bhiksha},
  title         = {Privacy Preserving Spam Filtering},
  year          = {2011},
  eprint        = {1102.4021},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1102.4021},
  note          = {9 pages},
  abstract      = {We present an approach to training a binary logistic regression classifier in
the setting where the training data needs to be kept private. We provide a
theoretical analysis of the security of this procedure and experimental results
for the problem of large scale spam detection. High performance spam filters
often use character n-grams as features which result in large sparse vectors to
which applying our protocol directly is not feasible. We explore various
dimensionality reduction and parallelization approaches and provide a detailed
analysis of the speed and accuracy trade-off. Our results show that we can
achieve the accuracy of state of the art spam filters at comparable training
and testing time of non-private version of logistic regression.},
  keywords      = {info2.0 privacy spam-detection},
  description   = {[1102.4021] Privacy Preserving Spam Filtering},
  added-at      = {2011-05-04T17:17:33.000+0200},
  timestamp     = {2011-05-04T17:17:33.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2734857d94b8989d6df811e9875c0236b/beate},
  interhash     = {1765f0135ec52d15f6dcf190d223278d},
  intrahash     = {734857d94b8989d6df811e9875c0236b},
}