This paper presents a new method for topic-based document segmentation, i.e., the identification of boundaries between parts of a document that bear on different topics. The method combines the use of the Probabilistic Latent Semantic Analysis (PLSA) model with the method of selecting segmentation points based on the similarity values between pairs of adjacent blocks. The use of PLSA allows for a better representation of sparse information in a text block, such as a sentence or a sequence of sentences. Furthermore, segmentation performance is improved by combining different instantiations of the same model, either using different random initializations or different numbers of latent classes. Results on commonly available data sets are significantly better than those of other state-of-the-art systems.
Description
Topic-based document segmentation with probabilistic latent semantic analysis
%0 Conference Paper
%1 Brants02
%A Brants, Thorsten
%A Chen, Francine
%A Tsochantaridis, Ioannis
%B CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management
%C New York, NY, USA
%D 2002
%I ACM
%K LinearAlgebra TextSegmentation
%P 211--218
%R http://doi.acm.org/10.1145/584792.584829
%T Topic-based document segmentation with probabilistic latent semantic analysis
%U http://portal.acm.org/citation.cfm?id=584829
%X This paper presents a new method for topic-based document segmentation, i.e., the identification of boundaries between parts of a document that bear on different topics. The method combines the use of the Probabilistic Latent Semantic Analysis (PLSA) model with the method of selecting segmentation points based on the similarity values between pairs of adjacent blocks. The use of PLSA allows for a better representation of sparse information in a text block, such as a sentence or a sequence of sentences. Furthermore, segmentation performance is improved by combining different instantiations of the same model, either using different random initializations or different numbers of latent classes. Results on commonly available data sets are significantly better than those of other state-of-the-art systems.
%@ 1-58113-492-4
@comment{Brants02: conference paper (inproceedings) from the CIKM '02 proceedings, published by ACM. The doi field holds the bare DOI without any resolver URL, so that bibliography styles can build the link themselves.}
@inproceedings{Brants02,
abstract = {This paper presents a new method for topic-based document segmentation, i.e., the identification of boundaries between parts of a document that bear on different topics. The method combines the use of the Probabilistic Latent Semantic Analysis (PLSA) model with the method of selecting segmentation points based on the similarity values between pairs of adjacent blocks. The use of PLSA allows for a better representation of sparse information in a text block, such as a sentence or a sequence of sentences. Furthermore, segmentation performance is improved by combining different instantiations of the same model, either using different random initializations or different numbers of latent classes. Results on commonly available data sets are significantly better than those of other state-of-the-art systems.},
added-at = {2009-03-30T19:12:48.000+0200},
address = {New York, NY, USA},
author = {Brants, Thorsten and Chen, Francine and Tsochantaridis, Ioannis},
biburl = {https://www.bibsonomy.org/bibtex/2983e3f0107538e1bd9d71f60db7b2067/mkroell},
booktitle = {CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management},
description = {Topic-based document segmentation with probabilistic latent semantic analysis},
doi = {10.1145/584792.584829},
interhash = {03b355dd3d9e2c69cd2eae3deaf51f1c},
intrahash = {983e3f0107538e1bd9d71f60db7b2067},
isbn = {1-58113-492-4},
keywords = {LinearAlgebra TextSegmentation},
location = {McLean, Virginia, USA},
pages = {211--218},
publisher = {ACM},
timestamp = {2009-03-31T10:28:19.000+0200},
title = {Topic-based document segmentation with probabilistic latent semantic analysis},
url = {http://portal.acm.org/citation.cfm?id=584829},
year = 2002
}