We propose a new unsupervised learning technique for extracting information about authors
and topics from large text collections. We model documents as if they were generated by
a two-stage stochastic process. An author is represented by a probability distribution over
topics, and each topic is represented as a probability distribution over words. The probability
distribution over topics in a multi-author paper is a mixture of the distributions associated
with the authors. The topic-word and author-topic distributions are learned from data in an
unsupervised manner using a Markov chain Monte Carlo algorithm. We apply the methodology
to three large text corpora: 150,000 abstracts from the CiteSeer digital library, 1,740 papers
from the Neural Information Processing Systems Conference (NIPS), and 121,000 emails from a
large corporation. We discuss in detail the interpretation of the results discovered by the system
including specific topic and author models, ranking of authors by topic and topics by author,
parsing of abstracts by topics and authors, and detection of unusual papers by specific authors.
Experiments based on perplexity scores for test documents are used to illustrate systematic
differences between the proposed author topic model and a number of alternatives. Extensions
to the model, allowing (for example) generalizations of the notion of an author, are also briefly
discussed.
%0 Report
%1 citeulike:531120
%A Rosen-Zvi, Michal
%A Griffiths, Thomas
%A Smyth, Padhraic
%A Steyvers, Mark
%D 2005
%K topicinference aaa
%T Learning Author Topic Models from Text Corpora
%U http://www.ics.uci.edu/~smyth/kddpapers/UCI_KD-D_author_topic_preprint.pdf
%X We propose a new unsupervised learning technique for extracting information about authors
and topics from large text collections. We model documents as if they were generated by
a two-stage stochastic process. An author is represented by a probability distribution over
topics, and each topic is represented as a probability distribution over words. The probability
distribution over topics in a multi-author paper is a mixture of the distributions associated
with the authors. The topic-word and author-topic distributions are learned from data in an
unsupervised manner using a Markov chain Monte Carlo algorithm. We apply the methodology
to three large text corpora: 150,000 abstracts from the CiteSeer digital library, 1,740 papers
from the Neural Information Processing Systems Conference (NIPS), and 121,000 emails from a
large corporation. We discuss in detail the interpretation of the results discovered by the system
including specific topic and author models, ranking of authors by topic and topics by author,
parsing of abstracts by topics and authors, and detection of unusual papers by specific authors.
Experiments based on perplexity scores for test documents are used to illustrate systematic
differences between the proposed author topic model and a number of alternatives. Extensions
to the model, allowing (for example) generalizations of the notion of an author, are also briefly
discussed.
@techreport{citeulike:531120,
  abstract             = {We propose a new unsupervised learning technique for extracting information about authors
and topics from large text collections. We model documents as if they were generated by
a two-stage stochastic process. An author is represented by a probability distribution over
topics, and each topic is represented as a probability distribution over words. The probability
distribution over topics in a multi-author paper is a mixture of the distributions associated
with the authors. The topic-word and author-topic distributions are learned from data in an
unsupervised manner using a Markov chain Monte Carlo algorithm. We apply the methodology
to three large text corpora: 150,000 abstracts from the CiteSeer digital library, 1,740 papers
from the Neural Information Processing Systems Conference (NIPS), and 121,000 emails from a
large corporation. We discuss in detail the interpretation of the results discovered by the system
including specific topic and author models, ranking of authors by topic and topics by author,
parsing of abstracts by topics and authors, and detection of unusual papers by specific authors.
Experiments based on perplexity scores for test documents are used to illustrate systematic
differences between the proposed author topic model and a number of alternatives. Extensions
to the model, allowing (for example) generalizations of the notion of an author, are also briefly
discussed.},
  added-at             = {2006-06-16T10:34:37.000+0200},
  author               = {Rosen-Zvi, Michal and Griffiths, Thomas and Smyth, Padhraic and Steyvers, Mark},
  biburl               = {https://www.bibsonomy.org/bibtex/2b6aa57c5793b8e41cc485c78d85d05d4/ldietz},
  citeulike-article-id = {531120},
  comment              = {Full Version.
The material in this paper was presented in part at the 2004 Uncertainty in AI Conference and the 2004 ACM
SIGKDD Conference.},
  institution          = {University of California, Irvine},
  internal-note        = {institution inferred from the ics.uci.edu URL and author affiliation -- confirm; required field for @techreport was missing},
  interhash            = {debfc8f6f0964c5aefd924086d049099},
  intrahash            = {b6aa57c5793b8e41cc485c78d85d05d4},
  keywords             = {topicinference aaa},
  month                = nov,
  priority             = {0},
  timestamp            = {2006-06-16T10:34:37.000+0200},
  title                = {Learning Author Topic Models from Text Corpora},
  url                  = {http://www.ics.uci.edu/~smyth/kddpapers/UCI_KD-D_author_topic_preprint.pdf},
  year                 = {2005},
}