% Web link-mining / information-retrieval references.
% The biburl fields point at BibSonomy (user "brightbyte"); the citeulike:* keys
% and the "description" fields suggest several entries were imported from
% CiteULike (not verified here).
% Non-standard fields (id, priority, biburl, description, keywords) are kept
% as-is; standard styles ignore them.

@inproceedings{craswell:esf,
  title     = {Effective Site Finding using Link Anchor Information},
  author    = {N. Craswell and D. Hawking and S. Robertson},
  booktitle = {Proceedings of the 24th Annual International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  year      = 2001,
  url       = {http://terral.lsi.uned.es/WebMining/Tema3.B%FAsqueda/craswell2001.pdf},
  id        = {2157311},
  priority  = {3},
  biburl    = {http://www.bibsonomy.org/bibtex/27c4a2ad4256210d164d140471a5b3154/brightbyte},
  keywords  = {link-mining web}
}

@misc{bernerslee1998uri,
  title     = {Cool {URIs} don't change},
  author    = {Tim Berners-Lee},
  publisher = {World Wide Web Consortium},
  year      = 1998,
  url       = {http://www.w3.org/Provider/Style/URI.html},
  biburl    = {http://www.bibsonomy.org/bibtex/232d7ad905f83bcff44f82b9f1e5953c7/brightbyte},
  keywords  = {style web semanticweb classic URI}
}

@article{gleim:wcm,
  title    = {Web Corpus Mining by instance of {Wikipedia}},
  author   = {R. Gleim and A. Mehler and M. Dehmer},
  journal  = {Web as Corpus},
  year     = 2006,
  id       = {2157099},
  priority = {3},
  biburl   = {http://www.bibsonomy.org/bibtex/2e11c412fbeb95303429892e447598aea/brightbyte},
  keywords = {text-mining wikipedia information-retrieval web}
}

@misc{citeulike:348222,
  title    = {Link Analysis Ranking Algorithms Theory And Experiments},
  author   = {Allan Borodin and Gareth O. Roberts and Jeffrey S. Rosenthal and Panayiotis Tsaparas},
  year     = 2005,
  url      = {http://citeseer.ist.psu.edu/borodin04link.html},
  id       = {348222},
  priority = {3},
  abstract = {The explosive growth and the widespread accessibility of the Web has led to surge of research activity in the area of information retrieval on the World Wide Web. The seminal papers of Kleinberg [31], and Brin and Page [9] introduced Link Analysis Ranking, where hyperlink structures are used to determine the relative authority of a Web page, and produce improved algorithms for the ranking of Web search results.
In this paper we work within the hubs and authorities framework defined by...},
  biburl   = {http://www.bibsonomy.org/bibtex/2017de3ca82fbf15d92f5dbd18bdb4727/brightbyte},
  keywords = {link-mining information-retrieval web}
}

@article{citeulike:631058,
  title       = {Mining the {Web}'s link structure},
  author      = {S. Chakrabarti and B. E. Dom and S. R. Kumar and P. Raghavan and S. Rajagopalan and A. Tomkins and D. Gibson and J. Kleinberg},
  journal     = {Computer},
  number      = 8,
  pages       = {60--67},
  volume      = 32,
  year        = 1999,
  url         = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=781636},
  id          = {631058},
  priority    = {3},
  description = {stuff from citeyoulike},
  abstract    = {The Web is a hypertext body of approximately 300 million pages that continues to grow at roughly a million pages per day. Page variation is more prodigious than the data's raw scale: taken as a whole, the set of Web pages lacks a unifying structure and shows far more authoring style and content variation than that seen in traditional text document collections. This level of complexity makes an ``off-the-shelf'' database management and information retrieval solution impossible. To date, index based search engines for the Web have been the primary tool by which users search for information. Such engines can build giant indices that let you quickly retrieve the set of all Web pages containing a given word or string. Experienced users can make effective use of such engines for tasks that can be solved by searching for tightly constrained key words and phrases. These search engines are, however, unsuited for a wide range of equally important tasks. In particular, a topic of any breadth will typically contain several thousand or million relevant Web pages. How then, from this sea of pages, should a search engine select the correct ones---those of most value to the user?
Clever is a search engine that analyzes hyperlinks to uncover two types of pages: authorities, which provide the best source of information on a given topic; and hubs, which provide collections of links to authorities. We outline the thinking that went into Clever's design, report briefly on a study that compared Clever's performance to that of Yahoo and AltaVista, and examine how our system is being extended and updated},
  biburl      = {http://www.bibsonomy.org/bibtex/2676a8ca47f10b35263ce196e700a8d9e/brightbyte},
  keywords    = {link-mining information-retrieval web}
}

@article{citeulike:609165,
  title       = {Link mining: a survey},
  address     = {New York, NY, USA},
  author      = {Lise Getoor and Christopher P. Diehl},
  journal     = {{ACM} {SIGKDD} Explorations Newsletter},
  month       = dec,
  number      = 2,
  pages       = {3--12},
  publisher   = {ACM Press},
  volume      = 7,
  year        = 2005,
  url         = {http://portal.acm.org/citation.cfm?id=1117454.1117456},
  id          = {609165},
  issn        = {1931-0145},
  priority    = {3},
  doi         = {10.1145/1117454.1117456},
  description = {stuff from citeyoulike},
  biburl      = {http://www.bibsonomy.org/bibtex/2ac02f1d7dea7a106bc4103c8a9ec4aef/brightbyte},
  keywords    = {link-mining information-retrieval web}
}

@misc{citeulike:525472,
  title         = {Preferential attachment in the growth of social networks: the case of {Wikipedia}},
  author        = {A. Capocci and V. D. P. Servedio and F. Colaiori and L. S. Buriol and D. Donato and S. Leonardi and G. Caldarelli},
  month         = feb,
  year          = 2006,
  url           = {http://arxiv.org/abs/physics/0602026},
  id            = {2162756},
  priority      = {2},
  eprint        = {physics/0602026},
  archiveprefix = {arXiv},
  description   = {stuff from citeyoulike},
  abstract      = {We present an analysis of the statistical properties and growth of the free on-line encyclopedia Wikipedia. By describing topics by vertices and hyperlinks between them as edges, we can represent this encyclopedia as a directed graph. The topological properties of this graph are in close analogy with that of the World Wide Web, despite the very different growth mechanism.
In particular we measure a scale-invariant distribution of the in- and out-degree and we are able to reproduce these features by means of a simple statistical model. As a major consequence, Wikipedia growth can be described by local rules such as the preferential attachment mechanism, though users can act globally on the network.},
  biburl        = {http://www.bibsonomy.org/bibtex/277e4e6c1601e81ae88d3dff90a483b86/brightbyte},
  keywords      = {link-mining wikipedia web small-world}
}

@inproceedings{citeulike:348187,
  title       = {Using {Web} structure for classifying and describing {Web} pages},
  author      = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and David M. Pennock and Gary W. Flake},
  booktitle   = {Proceedings of WWW-02, International Conference on the World Wide Web},
  year        = 2002,
  url         = {http://citeseer.ist.psu.edu/537010.html},
  id          = {2162763},
  priority    = {2},
  description = {stuff from citeyoulike},
  abstract    = {The structure of the web is increasingly being used to improve organization, search, and analysis of information on the web. For example, Google uses the text in citing documents (documents that link to the target document) for search. We analyze the relative utility of document text, and the text in citing documents near the citation, for classification and description.
Results show that the text in citing documents, when available, often has greater discriminative and descriptive power than...},
  biburl      = {http://www.bibsonomy.org/bibtex/2c52498517c77d892e621ab8b900c1388/brightbyte},
  keywords    = {link-mining information-retrieval web}
}

@book{citeulike:111664,
  title        = {Mining the {Web}: Analysis of Hypertext and Semi Structured Data},
  author       = {Soumen Chakrabarti},
  howpublished = {Hardcover},
  month        = aug,
  publisher    = {Morgan Kaufmann},
  year         = 2002,
  url          = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/1558607544},
  id           = {111664},
  priority     = {2},
  isbn         = {1558607544},
  description  = {stuff from citeyoulike},
  abstract     = {Mining the Web: Discovering Knowledge from Hypertext Data is the first book devoted entirely to techniques for producing knowledge from the vast body of unstructured Web data. Building on an initial survey of infrastructural issues---including Web crawling and indexing---Chakrabarti examines low-level machine learning techniques as they relate specifically to the challenges of Web mining. He then devotes the final part of the book to applications that unite infrastructure and analysis to bring machine learning to bear on systematically acquired and stored data. Here the focus is on results: the strengths and weaknesses of these applications, along with their potential as foundations for further progress. From Chakrabarti's work---painstaking, critical, and forward-looking---readers will gain the theoretical and practical understanding they need to contribute to the Web mining effort.

* A comprehensive, critical exploration of statistics-based attempts to make sense of Web Mining.
* Details the special challenges associated with analyzing unstructured and semi-structured data.
* Looks at how classical Information Retrieval techniques have been modified for use with Web data.
* Focuses on today's dominant learning methods: clustering and classification, hyperlink analysis, and supervised and semi-supervised learning.
* Analyzes current applications for resource discovery and social network analysis.
* An excellent way to introduce students to especially vital applications of data mining and machine learning technology.},
  biburl       = {http://www.bibsonomy.org/bibtex/28010baa00eb2c4f4bf179025258f8fd0/brightbyte},
  keywords     = {text-mining link-mining information-retrieval web}
}

@inproceedings{citeulike:542510,
  title       = {The structure of broad topics on the web},
  address     = {New York, NY, USA},
  author      = {Soumen Chakrabarti and Mukul M. Joshi and Kunal Punera and David M. Pennock},
  booktitle   = {WWW '02: Proceedings of the 11th international conference on World Wide Web},
  pages       = {251--262},
  publisher   = {ACM Press},
  year        = 2002,
  url         = {http://portal.acm.org/citation.cfm?id=511480},
  id          = {542510},
  priority    = {2},
  isbn        = {1581134495},
  doi         = {10.1145/511446.511480},
  description = {stuff from citeyoulike},
  biburl      = {http://www.bibsonomy.org/bibtex/242e804cc0f6553146387a981983e8afc/brightbyte},
  keywords    = {link-mining web taxonomy}
}