% Clustering-related references. Each entry carries a biburl field pointing
% at the BibSonomy record it appears to have been exported from.
% Conventions used below: one field per line, page ranges with "--",
% bare DOIs (no resolver prefix), accents as braced LaTeX escapes.

@inproceedings{valsan03clustering,
  title     = {Thematic text clustering for domain specific language model adaptation},
  author    = {Z. Valsan and M. Emele},
  booktitle = {Automatic Speech Recognition and Understanding, 2003. ASRU '03. 2003 IEEE Workshop on},
  pages     = {513--518},
  year      = 2003,
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1318493},
  isbn      = {0-7803-7980-2},
  doi       = {10.1109/ASRU.2003.1318493},
  abstract  = {We propose a new approach for thematic text clustering. The text clusters are used to generate domain specific language models in order to address the problem of language model adaptation. The method relies on a new discriminative n-gram based term selection process ($n>1$), which reduces the influence of the corpus inhomogeneity, and outputs only semantically focused n-grams as being the most representative key terms in the corpus. These key terms are then used to automatically cluster the whole document collection and generate LM out of these text clusters. Different key term selection methods are evaluated using perplexity as a measure. Automatically computed clusters are compared with manually assigned labelling according to genre information. The results of these experimental studies are presented and discussed. Compared to the manual clustering a significant performance improvement between 21.87 \% and 53.12 \% is observed depending on the chosen key term selection method.},
  biburl    = {http://www.bibsonomy.org/bibtex/25db7751ea4549230d191e6c34edcc79b/msn},
  keywords  = {mrefs research.clustering state.printed state.toRead research.nlp research.kr.domain}
}

@inproceedings{sander03extraction,
  title     = {Automatic Extraction of Clusters from Hierarchical Clustering Representations},
  author    = {J{\"o}rg Sander and Xuejie Qin and Zhiyong Lu and Nan Niu and Alex Kovarsky},
  booktitle = {Proc. of 7th Pacific-Asia Conf. of Advances in Knowledge Discovery and Data Mining, PAKDD 2003, Proceedings},
  pages     = {75--87},
  publisher = {Springer},
  year      = 2003,
  url       = {http://www.springerlink.com/content/he3wv27nyj5ldh3y/},
  language  = {english},
  abstract  = {Hierarchical clustering algorithms are typically more effective in detecting the true clustering structure of a data set than partitioning algorithms. However, hierarchical clustering algorithms do not actually create clusters, but compute only a hierarchical representation of the data set. This makes them unsuitable as an automatic pre-processing step for other algorithms that operate on detected clusters. This is true for both dendrograms and reachability plots, which have been proposed as hierarchical clustering representations, and which have different advantages and disadvantages. In this paper we first investigate the relation between dendrograms and reachability plots and introduce methods to convert them into each other showing that they essentially contain the same information. Based on reachability plots, we then introduce a technique that automatically determines the significant clusters in a hierarchical cluster representation. This makes it for the first time possible to use hierarchical clustering as an automatic pre-processing step that requires no user interaction to select clusters from a hierarchical cluster representation.},
  biburl    = {http://www.bibsonomy.org/bibtex/2bfcee29ca5ead9e34a81bc5d2eab89a5/msn},
  keywords  = {cites.procm mrefs state.toRead research.clustering}
}

@mastersthesis{nurminen05xml,
  title    = {Tiedonlouhinta rakenteisista dokumenteista},
  author   = {Miika Nurminen},
  school   = {University of Jyv{\"a}skyl{\"a}},
  year     = 2005,
  url      = {http://thesis.jyu.fi/05/URN_NBN_fi_jyu-200594.pdf},
  biburl   = {http://www.bibsonomy.org/bibtex/292746b7d6aeaa69e30813b3a68b8e014/msn},
  keywords = {research.ir mrefs research.mining research.clustering research.xml research.papers}
}

@inproceedings{nurminen05extminer,
  title     = {{ExtMiner}: Combining Multiple Ranking and Clustering Algorithms for Structured Document Retrieval},
  author    = {Miika Nurminen and Anne Honkaranta and Tommi K{\"a}rkk{\"a}inen},
  booktitle = {International workshop on Integrating Data Mining, Databases and Information Retrieval (IDDI'05), Proceedings of the 16th International Workshop on Database and Expert Systems Applications},
  pages     = {1036--1040},
  publisher = {IEEE Computer Society},
  year      = 2005,
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1508411},
  location  = {Copenhagen, Denmark},
  abstract  = {This paper introduces ExtMiner, a platform and potential tool for information management in SMEs (small \& medium-size enterprise), or for organizational workgroups. ExtMiner supports interactive and iterative clustering of documents. It provides users with a visual cluster and list views at the same time, supporting iterative search process. ExtMiner may also be applied as a platform for research on retrieval fusion, since it combines search, clustering and visualization algorithms. ExtMiner was evaluated with three document collections. Although the findings were encouraging the user interface and performance with large document repositories need further development.},
  biburl    = {http://www.bibsonomy.org/bibtex/23ec379c314de9661680ae26e06d59d88/msn},
  keywords  = {research.clustering research.xml mrefs cites.procm cites.dss.r research.ir.ranking research.mining research.ir research.papers}
}

% A paper within a proceedings volume (it has authors and a page range),
% so it is typed as inproceedings with the venue in booktitle.
@inproceedings{yang05learning,
  title     = {Learning the Kernel Matrix for {XML} Document Clustering},
  author    = {Jianwu Yang and W. Cheung and Xiaoou Chen},
  booktitle = {e-Technology, e-Commerce and e-Service, 2005. EEE '05. Proceedings. The 2005 IEEE International Conference on},
  pages     = {353--358},
  year      = 2005,
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1402321},
  id        = {201627},
  priority  = {2},
  abstract  = {The rapid growth of XML adoption has urged for the need of a proper representation for semi-structured documents, where the document structural information has to be taken into account so as to support more precise document analysis. In this paper, an XML document representation named "structured link vector model" is adopted, with a kernel matrix included for modeling the similarity between XML elements. Our formulation allows individual XML elements to have their own weighted contribution to the overall document similarity while at the same time allows the between-element similarity to be captured. An iterative algorithm is derived to learn the kernel matrix. For performance evaluation, the ACM SIGMOD Record dataset as well as the CEDB dataset have been tested. Our proposed method outperforms significantly the traditional vector space model and the edit-distance based methods. In addition, the kernel matrix obtained as a by-product provides knowledge about the conceptual relationship between the XML elements.},
  biburl    = {http://www.bibsonomy.org/bibtex/23fee4424f902a0d0385e744b075ffd05/msn},
  keywords  = {research.xml state.toRead mrefs research.clustering}
}

@article{kashyap05taxaminer,
  title    = {{TaxaMiner}: an experimentation framework for automated taxonomy bootstrapping},
  author   = {Vipul Kashyap and Cartic Ramakrishnan and Christopher Thomas and A. Sheth},
  journal  = {International Journal of Web and Grid Services},
  number   = 2,
  pages    = {240--266},
  volume   = 1,
  year     = 2005,
  url      = {http://www.inderscience.com/search/index.php?action=record\&rec_id=8322},
  id       = {593431},
  priority = {2},
  doi      = {10.1504/IJWGS.2005.008322},
  abstract = {Construction of domain ontologies on the semantic web is a human and resource intensive process, efforts to reduce which are crucial for the Semantic Web to scale. We present a framework for automated taxonomy construction, that involves: (a) generation of a cluster hierarchy from a document corpus using statistical clustering and NLP techniques; (b) extraction of a topic hierarchy from this cluster hierarchy; and (c) assignment of labels to nodes in the topic hierarchy. Metrics for estimating topic hierarchy quality and parameters of an experimentation framework are identified. MEDLINE was the document corpus and MeSH thesaurus was the gold standard.},
  biburl   = {http://www.bibsonomy.org/bibtex/2ea883cf5f9761fd99fb252e8b0f7330b/msn},
  keywords = {research.conceptual.generation research.clustering mrefs research.kr.ontologies}
}

@inproceedings{Huang:2007,
  title     = {A Fast Algorithm for Balanced Graph Clustering},
  author    = {Mao Lin Huang and Quang Vinh Nguyen},
  booktitle = {Information Visualization, 2007. IV '07. 11th International Conference},
  pages     = {46--52},
  year      = 2007,
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?tp=&arnumber=4271960&isnumber=4271944},
  issn      = {1550-6037},
  isbn      = {0-7695-2900-3},
  doi       = {10.1109/IV.2007.10},
  abstract  = {Scalability problem is a long-lasting challenge for both information visualization and graph drawing communities. Available graph visualization techniques could perform well for small or medium size graphs but they are rarely able to handle very large and complex graphs. One of effective approach to solve this problem is to employ graph abstraction; that is to hierarchically partitioning the complete graph into a clustered graph. A graph visualization technique is then applied to display the abstract view of this clustered graph with partially displayed detail of one or a few sub-graphs where the user is currently focusing on. This reduces the complexity of display and makes it easier for users to interpret, perceive and navigate the large scale information. In this paper, we propose a graph clustering method which can quickly discover the community structure embedded in large graphs and partition the graph into densely connected sub-graphs. The proposed algorithm can not only run fast, but also achieve a consistent partitioning result in which a graph is divided into a set of clusters of the similar size in terms of their visual complexity and the number of nodes and edges. In addition, we also provide a mechanism to partition very dense graphs in which the number of edges is much larger than the number of nodes.},
  biburl    = {http://www.bibsonomy.org/bibtex/26b15824d37477cde9c2ffc6c59392ba5/msn},
  keywords  = {research.clustering research.ir.visualization mrefs research.conceptual.graphs}
}

% longnotes, pdf, read, lastname and own are non-standard fields; they are
% kept verbatim as personal annotations.
@inproceedings{brooks06-improved,
  title            = {Improved annotation of the blogosphere via autotagging and hierarchical clustering},
  address          = {New York, NY, USA},
  author           = {Christopher H. Brooks and Nancy Montanez},
  booktitle        = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  pages            = {625--632},
  publisher        = {ACM Press},
  year             = 2006,
  url              = {http://www2006.org/programme/item.php?id=583},
  lastdatemodified = {2006-07-18},
  longnotes        = {[[http://www2006.org/programme/files/pdf/583-slides.pdf slides]] Summary: - authors analyse the effectiveness of tags for classifying blog articles (technorati) - clustering of articles beloning to top 350 technorati tags * by tag * randomly * by related by Google News - results: * tags help to classify articles into broad categories (yet Google News performs better) * tags are not that descriptive for a specific topic of an article * automatically extracted tags (by TF/IDF) are much more descriptive for specific content - 2nd study: hierarchical clustering of articles (starting from tag clusters, i.e. all articles who share a tag) - resulting tag hierarchy comes close to e.g. Yahoo hand-built one},
  pdf              = {brooks06-improved.pdf},
  read             = {read},
  lastname         = {Brooks},
  own              = {own},
  abstract         = {Tags have recently become popular as a means of annotating and organizing Web pages and blog entries. Advocates of tagging argue that the use of tags produces a 'folksonomy', a system in which the meaning of a tag is determined by its use among the community as a whole. We analyze the effectiveness of tags for classifying blog entries by gathering the top 350 tags from Technorati and measuring the similarity of all articles that share a tag. We find that tags are useful for grouping articles into broad categories, but less effective in indicating the particular content of an article. We then show that automatically extracting words deemed to be highly relevant can produce a more focused categorization of articles. We also show that clustering algorithms can be used to reconstruct a topical hierarchy among tags, and suggest that these approaches may be used to address some of the weaknesses in current tagging systems.},
  biburl           = {http://www.bibsonomy.org/bibtex/25c9c83e89da2faa8906a5927fe7ca3ef/msn},
  keywords         = {mrefs research.clustering research.web20.tagging}
}

% arXiv preprint: month uses the predefined macro, eprinttype names the archive.
@misc{citeulike:361498,
  title      = {Folksonomy as a Complex Network},
  author     = {Kaikai Shen and Lide Wu},
  month      = sep,
  year       = 2005,
  url        = {http://arxiv.org/abs/cs.IR/0509072},
  id         = {361498},
  priority   = {2},
  eprint     = {cs.IR/0509072},
  eprinttype = {arXiv},
  abstract   = {Folksonomy is an emerging technology that works to classify the information over WWW through tagging the bookmarks, photos or other web-based contents. It is understood to be organized by every user while not limited to the authors of the contents and the professional editors. This study surveyed the folksonomy as a complex network. The result indicates that the network, which is composed of the tags from the folksonomy, displays both properties of small world and scale-free. However, the statistics only shows a local and static slice of the vast body of folksonomy which is still evolving.},
  biburl     = {http://www.bibsonomy.org/bibtex/20a5d7dfd17c6952fe7a07f7756098601/msn},
  keywords   = {mrefs state.toRead research.conceptual.folksonomy research.clustering}
}

@techreport{berkhin02survey,
  title       = {Survey Of Clustering Data Mining Techniques},
  address     = {San Jose, CA},
  author      = {Pavel Berkhin},
  institution = {Accrue Software},
  year        = 2002,
  url         = {http://citeseer.nj.nec.com/berkhin02survey.html},
  biburl      = {http://www.bibsonomy.org/bibtex/22fec60df240f69dbf677e34825d20491/msn},
  keywords    = {mrefs research.clustering cites.gradu}
}

@article{zhong03framework,
  title     = {A unified framework for model-based clustering},
  author    = {Shi Zhong and Joydeep Ghosh},
  journal   = {Journal of Machine Learning Research},
  pages     = {1001--1037},
  publisher = {MIT Press},
  volume    = 4,
  year      = 2003,
  url       = {http://www.ai.mit.edu/projects/jmlr//papers/v4/zhong03a.html},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/206ce8819ccec49348fd604943334ad7f/msn},
  keywords  = {mrefs research.clustering state.printed cites.gradu}
}

% doi is the identifier embedded in the resolver URL of the url field.
@article{wu01clustering,
  title     = {Using clustering and classification approaches in interactive retrieval},
  author    = {Mingfang Wu and Michael Fuller and Ross Wilkinson},
  journal   = {Information Processing \& Management},
  number    = 3,
  pages     = {459--484},
  publisher = {Elsevier},
  volume    = 37,
  year      = 2001,
  url       = {http://dx.doi.org/10.1016/S0306-4573(00)00057-1},
  doi       = {10.1016/S0306-4573(00)00057-1},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/2dec422b095a5d606d179896e2782d64d/msn},
  keywords  = {mrefs research.ir research.clustering cites.gradu}
}

@inproceedings{weiss96hypursuit,
  title     = {{HyPursuit}: a hierarchical network search engine that exploits content-link hypertext clustering},
  author    = {Ron Weiss and Bienvenido V{\'e}lez and Mark A. Sheldon},
  booktitle = {Proceedings of the Seventh ACM Conference on Hypertext},
  editor    = {David Stotts},
  pages     = {180--193},
  publisher = {ACM Press},
  year      = 1996,
  url       = {http://doi.acm.org/10.1145/234828.234846},
  location  = {Bethesda, Maryland, United States},
  pdf       = {weiss96hypursuit.pdf},
  language  = {english},
  doi       = {10.1145/234828.234846},
  biburl    = {http://www.bibsonomy.org/bibtex/2e1dfb39d8481495abc692a4f4214436c/msn},
  keywords  = {state.printed research.clustering research.ir.fusion mrefs cites.gradu}
}

@phdthesis{tantrum03clustering,
  title    = {Model Based and Hybrid Clustering of Large Datasets},
  author   = {Jeremy Tantrum},
  school   = {University of Washington},
  year     = 2003,
  url      = {http://www.stat.washington.edu/tantrum/thesis.pdf},
  language = {english},
  biburl   = {http://www.bibsonomy.org/bibtex/2df0a4d720493a157dc622b2d6ce453a5/msn},
  keywords = {cites.gradu mrefs research.clustering}
}

@incollection{noel03clustering,
  title     = {Document Clustering, Visualization, and Retrieval via Link Mining},
  author    = {Steven Noel and Vijay Raghavan and C.-H. Henry Chu},
  booktitle = {Clustering and Information Retrieval},
  pages     = {161--194},
  publisher = {Kluwer},
  series    = {Network Theory and Applications},
  volume    = 11,
  year      = 2003,
  url       = {http://www.isse.gmu.edu/~snoel/Clustering\%20and\%20IR\%20chapter.htm},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/27fcab8355b0abccdee5071f3367848f7/msn},
  keywords  = {research.ir mrefs research.conceptual.graphs research.clustering cites.gradu}
}

% The first editor uses the "Last, Jr, First" form so that "III" is parsed
% as a name suffix rather than as part of the surname.
@inproceedings{modha00clustering,
  title     = {Clustering hypertext with applications to web searching},
  author    = {Dharmendra S. Modha and W. Scott Spangler},
  booktitle = {Proceedings of the eleventh ACM on Hypertext and hypermedia},
  editor    = {Shipman, III, Frank M. and Peter J. N{\"u}rnberg and David L. Hicks},
  pages     = {143--152},
  publisher = {ACM Press},
  year      = 2000,
  url       = {http://doi.acm.org/10.1145/336296.336351},
  location  = {San Antonio, Texas, United States},
  pdf       = {modha00clustering.pdf},
  language  = {english},
  doi       = {10.1145/336296.336351},
  biburl    = {http://www.bibsonomy.org/bibtex/20c7777fdf51e7935ed3ca6d35eca9f87/msn},
  keywords  = {cites.gradu research.clustering research.ir.fusion mrefs}
}

% The second author is given in "Last, First" form: written as
% "David Wai lok Cheung" the lowercase "lok" would be parsed as a von-part.
% doi is the identifier embedded in the resolver URL of the url field.
@article{lian04xml,
  title     = {An Efficient and Scalable Algorithm for Clustering {XML} Documents by Structure},
  author    = {Wang Lian and Cheung, David Wai lok and Nikos Mamoulis and Siu-Ming Yiu},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  number    = 1,
  pages     = {82--96},
  publisher = {IEEE Educational Activities Department},
  volume    = 16,
  year      = 2004,
  url       = {http://dx.doi.org/10.1109/TKDE.2004.1264824},
  doi       = {10.1109/TKDE.2004.1264824},
  alt       = {http://www.cs.hku.hk/~dcheung/publication/tkde2004.pdf},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/2d760492912097f5d18e25749eceb552f/msn},
  keywords  = {mrefs research.xml cites.gradu research.clustering}
}

@inproceedings{kogan03hybrid,
  title     = {Text Mining with hybrid clustering schemes},
  author    = {Jacob Kogan and Charles Nicholas and Vladimir Volkovich},
  booktitle = {Workshop on Text Mining, held in conjunction with the Third SIAM International Conference on Data Mining (SDM 2003)},
  pages     = {5--16},
  publisher = {Society for Industrial and Applied Mathematics},
  year      = 2003,
  url       = {http://www.csee.umbc.edu/~nicholas/clustering/Kogan03a.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/2655492e9f38e96044e63d56e96bf4f21/msn},
  keywords  = {research.integration cites.gradu research.mining.text mrefs research.clustering}
}

% Journal name stored in full; doi is the identifier embedded in the
% resolver URL of the url field.
@article{jain99clustering,
  title     = {Data clustering: a review},
  author    = {A. K. Jain and M. N. Murty and P. J. Flynn},
  journal   = {ACM Computing Surveys},
  number    = 3,
  pages     = {264--323},
  publisher = {ACM Press},
  volume    = 31,
  year      = 1999,
  url       = {http://doi.acm.org/10.1145/331499.331504},
  doi       = {10.1145/331499.331504},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/2b19bcef82a04eb82ee4abde53ee7d1c2/msn},
  keywords  = {cites.gradu research.clustering mrefs}
}

% doi is the identifier embedded in the resolver URL of the url field.
@article{he02clustering,
  title     = {Web document clustering using hyperlink structures},
  author    = {Xiaofeng He and Hongyuan Zha and Chris H. Q. Ding and Horst D. Simon},
  journal   = {Computational Statistics \& Data Analysis},
  number    = 1,
  pages     = {19--45},
  publisher = {Elsevier},
  volume    = 41,
  year      = 2002,
  url       = {http://dx.doi.org/10.1016/S0167-9473(02)00070-1},
  doi       = {10.1016/S0167-9473(02)00070-1},
  pdf       = {he02clustering.pdf},
  language  = {english},
  biburl    = {http://www.bibsonomy.org/bibtex/25781c48c37a0486f3f3bf4fd9e9b5c47/msn},
  keywords  = {cites.gradu research.clustering research.conceptual.graphs mrefs}
}