% Text classification references: Rocchio / centroid classifiers,
% Naive Bayes, and support vector machines.
%
% Conventions used in this file:
%   - one field per line, names in "Last, First" form, page ranges with "--";
%   - "timestamp" holds the catalogue record date. It must not be stored in
%     "date", because biblatex reads "date" as the publication date and
%     lets it override "year";
%   - biburl, keywords, ee, timestamp and the reading-status fields are
%     annotations that standard styles ignore.

% ---- Rocchio / centroid-based classification ----

@inproceedings{joachims1997rocchio,
  author    = {Joachims, Thorsten},
  title     = {A Probabilistic Analysis of the {Rocchio} Algorithm with {TFIDF} for Text Categorization},
  booktitle = {ICML},
  crossref  = {conf/icml/1997},
  editor    = {Fisher, Douglas H.},
  pages     = {143--151},
  publisher = {Morgan Kaufmann},
  year      = 1997,
  isbn      = {1-55860-486-3},
  url       = {http://dblp.uni-trier.de/db/conf/icml/icml1997.html#Joachims97},
  timestamp = {2002-12-04},
  biburl    = {http://www.bibsonomy.org/bibtex/2dfa99d567392038673882c932153053c/jil},
  keywords  = {bayes estimator laplace probabilistic rocchio tfidf}
}

@inproceedings{han2000rocchio,
  author    = {Han, Eui-Hong and Karypis, George},
  title     = {Centroid-Based Document Classification: Analysis and Experimental Results},
  booktitle = {PKDD},
  crossref  = {conf/pkdd/2000},
  editor    = {Zighed, Djamel A. and Komorowski, Henryk Jan and Zytkow, Jan M.},
  pages     = {424--431},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  volume    = 1910,
  year      = 2000,
  isbn      = {3-540-41066-X},
  url       = {http://glaros.dtc.umn.edu/gkhome/fetch/papers/centroidPKDD00.pdf},
  ee        = {http://link.springer.de/link/service/series/0558/bibs/1910/19100424.htm},
  timestamp = {2002-07-22},
  biburl    = {http://www.bibsonomy.org/bibtex/2e46f97a70e986c33b1822d6a247dd1a5/jil},
  keywords  = {average classification classifier cos cosinus interpretation klassifikation learning loose machine rocchio similarity simple tight}
}

% ---- Naive Bayes ----

@misc{kim2002naive,
  author   = {Kim, S. and Rim, H. and Yook, D. and Lim, H.},
  title    = {Effective methods for improving Naive {Bayes} text classifiers},
  year     = 2002,
  url      = {http://citeseer.ist.psu.edu/kim02effective.html},
  biburl   = {http://www.bibsonomy.org/bibtex/2b8f819dc681e76ee9723c72a859dff3c/jil},
  keywords = {bayes learning length machine multinomial naive normalization}
}

@mastersthesis{rennie2001naive,
  author   = {Rennie, Jason D. M.},
  title    = {Improving Multi-class Text Classification with Naive {Bayes}},
  school   = {Massachusetts Institute of Technology},
  year     = 2001,
  url      = {http://people.csail.mit.edu/~jrennie/papers/sm-thesis.pdf},
  abstract = {There are numerous text documents available in electronic form.
              More and more are becoming available every day. Such documents
              represent a massive amount of information that is easily
              accessible. Seeking value in this huge collection requires
              organization; much of the work of organizing documents can be
              automated through text classification. The accuracy and our
              understanding of such systems greatly influences their
              usefulness. In this paper, we seek 1) to advance the
              understanding of commonly used text classification techniques,
              and 2) through that understanding, improve the tools that are
              available for text classification. We begin by clarifying the
              assumptions made in the derivation of Naive Bayes, noting basic
              properties and proposing ways for its extension and improvement.
              Next, we investigate the quality of Naive Bayes parameter
              estimates and their impact on classification. Our analysis leads
              to a theorem which gives an explanation for the improvements
              that can be found in multiclass classification with Naive Bayes
              using Error-Correcting Output Codes. We use experimental
              evidence on two commonly-used data sets to exhibit an
              application of the theorem. Finally, we show fundamental flaws
              in a commonly-used feature selection algorithm and develop a
              statistics-based framework for text feature selection. Greater
              understanding of Naive Bayes and the properties of text allows
              us to make better use of it in text classification.},
  biburl   = {http://www.bibsonomy.org/bibtex/22896eb9538a6ee34f8e6c6757bdcf99e/jil},
  keywords = {bayes deduction estimation exhaustive herleitung komplett likelihood map maximum mle multinomial naive prior thesis}
}

@inproceedings{mccallum1998naive,
  author    = {McCallum, Andrew and Nigam, Kamal},
  title     = {A Comparison of Event Models for Naive {Bayes} Text Classification},
  booktitle = {Learning for Text Categorization: Papers from the 1998 {AAAI} Workshop},
  pages     = {41--48},
  year      = 1998,
  url       = {http://www.kamalnigam.com/papers/multinomial-aaaiws98.pdf},
  biburl    = {http://www.bibsonomy.org/bibtex/2fa46d1cc0dd56ab40a7f722e569a1fd3/jil},
  keywords  = {bayes bernoulli classification ereignis event model multinomial naive text vergleich}
}

@inproceedings{lewis1998naive,
  author    = {Lewis, David D.},
  title     = {Naive ({Bayes}) at forty: The independence assumption in information retrieval},
  booktitle = {Proceedings of {ECML}-98, 10th European Conference on Machine Learning},
  editor    = {N{\'{e}}dellec, Claire and Rouveirol, C{\'{e}}line},
  number    = 1398,
  pages     = {4--15},
  publisher = {Springer Verlag, Heidelberg, DE},
  address   = {Chemnitz, DE},
  year      = 1998,
  url       = {http://citeseer.ist.psu.edu/lewis98naive.html},
  biburl    = {http://www.bibsonomy.org/bibtex/2e290abb350b7aa09a412c1dddac55cd6/jil},
  keywords  = {bayes forty ir naive overview representation text}
}

@misc{metsis2006naive,
  author   = {Metsis, Vangelis and Androutsopoulos, Ion and Paliouras, Georgios},
  title    = {Spam Filtering with Naive {Bayes} -- Which Naive {Bayes}?},
  year     = 2006,
  url      = {http://citeseer.ist.psu.edu/757874.html},
  biburl   = {http://www.bibsonomy.org/bibtex/2b4e1a9d4635a9fb1f11a947f1ab3618a/jil},
  keywords = {bayes metsis multinomial multivariate naive spam}
}

% ---- Support vector machines ----

@techreport{lewis2004tutorial,
  author      = {Lewis, J. P.},
  title       = {A Short {SVM} (Support Vector Machine) Tutorial},
  institution = {CGIT Lab / IMSC},
  year        = 2004,
  url         = {http://www.idiom.com/~zilla/Work/Notes/svmtutorial.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2b7cf853e8635bd2887e8dea3d9e10ccb/jil},
  keywords    = {background kkt lagrange math mathe mathematik svm tutorial}
}

@article{burges1998,
  author   = {Burges, Christopher J. C.},
  title    = {A Tutorial on Support Vector Machines for Pattern Recognition},
  journal  = {Data Mining and Knowledge Discovery},
  volume   = 2,
  number   = 2,
  pages    = {121--167},
  year     = 1998,
  url      = {http://citeseer.ist.psu.edu/burges98tutorial.html},
  biburl   = {http://www.bibsonomy.org/bibtex/2ad2a33b52e690eaf15da04fff7f12755/jil},
  keywords = {burges deduction herleitung kkt lagrange svm tutorial}
}

@inproceedings{joachims1999,
  author           = {Joachims, Thorsten},
  title            = {Transductive Inference for Text Classification using Support Vector Machines},
  booktitle        = {Proceedings of {ICML}-99, 16th International Conference on Machine Learning},
  editor           = {Bratko, Ivan and Dzeroski, Saso},
  pages            = {200--209},
  publisher        = {Morgan Kaufmann Publishers, San Francisco, US},
  address          = {Bled, SL},
  year             = 1999,
  url              = {http://www.joachims.org/publications/joachims_99c.ps.gz},
  lastdatemodified = {2005-08-06},
  pdf              = {joachims99.pdf},
  read             = {notread},
  lastname         = {Joachims},
  own              = {own},
  abstract         = {This paper introduces Transductive Support Vector Machines
                      (TSVMs) for text classification. While regular Support
                      Vector Machines (SVMs) try to induce a general decision
                      function for a learning task, Transductive Support Vector
                      Machines take into account a particular test set and try
                      to minimize misclassifications of just those particular
                      examples. The paper presents an analysis of why TSVMs are
                      well suited for text classification. These theoretical
                      findings are supported by experiments on three test
                      collections. The experiments show substantial improvements
                      over inductive methods, especially for small training
                      sets, cutting the number of labeled training examples down
                      to a twentieth on some tasks. This work also proposes an
                      algorithm for training TSVMs efficiently, handling 10,000
                      examples and more.},
  biburl           = {http://www.bibsonomy.org/bibtex/27cf3e7981cac898c1745418db83e0fd6/jil},
  keywords         = {svm svmlight transductive}
}