| Authors: |
Ana Maguitman
and Filippo Menczer
and Fulya Erdinc
and Heather Roinestad
and Alessandro Vespignani
|
| URL: |
http://dx.doi.org/10.1007/s11280-006-8562-2 |
| Description: |
SpringerLink - Journal Article |
| Tags: |
imported
opendirectory
semantic
similarity
text
validation
|
| Abstract: |
Automatic extraction of semantic information from text and links in Web pages is key to improving the quality of search results.
However, the assessment of automatic semantic measures is limited by the coverage of user studies, which do not scale with the size, heterogeneity, and growth of the Web. Here we propose to leverage human-generated metadata—namely topical directories—to measure semantic relationships among massive numbers of pairs of Web pages or topics. The Open Directory Project classifies millions of URLs in a topical ontology, providing a rich source from which semantic relationships between Web pages can be derived. While semantic similarity measures based on taxonomies (trees) are well studied, the design of well-founded similarity measures for objects stored in the nodes of arbitrary ontologies (graphs) is an open problem. This paper defines an information-theoretic measure of semantic similarity that exploits both the hierarchical and non-hierarchical structure of an ontology. An experimental study shows that this measure improves significantly on the traditional taxonomy-based approach. This novel measure allows us to address the general question of how text and link analyses can be combined to derive measures of relevance that are in good agreement with semantic similarity. Surprisingly, the traditional use of text similarity turns out to be ineffective for relevance ranking. |
@comment{Journal article entry. The month field is the bare predefined macro dec (not a braced string) so the bibliography style can expand it; em dashes in the abstract are written as --- to keep the field ASCII-only.}
@article{keyhere,
  author      = {Maguitman, Ana and Menczer, Filippo and Erdinc, Fulya and Roinestad, Heather and Vespignani, Alessandro},
  title       = {Algorithmic Computation and Approximation of Semantic Similarity},
  journal     = {World Wide Web},
  volume      = {9},
  number      = {4},
  pages       = {431--456},
  month       = dec,
  year        = {2006},
  doi         = {10.1007/s11280-006-8562-2},
  url         = {http://dx.doi.org/10.1007/s11280-006-8562-2},
  description = {SpringerLink - Journal Article},
  abstract    = {Automatic extraction of semantic information from text and links in Web pages is key to improving the quality of search results.
However, the assessment of automatic semantic measures is limited by the coverage of user studies, which do not scale with the size, heterogeneity, and growth of the Web. Here we propose to leverage human-generated metadata---namely topical directories---to measure semantic relationships among massive numbers of pairs of Web pages or topics. The Open Directory Project classifies millions of URLs in a topical ontology, providing a rich source from which semantic relationships between Web pages can be derived. While semantic similarity measures based on taxonomies (trees) are well studied, the design of well-founded similarity measures for objects stored in the nodes of arbitrary ontologies (graphs) is an open problem. This paper defines an information-theoretic measure of semantic similarity that exploits both the hierarchical and non-hierarchical structure of an ontology. An experimental study shows that this measure improves significantly on the traditional taxonomy-based approach. This novel measure allows us to address the general question of how text and link analyses can be combined to derive measures of relevance that are in good agreement with semantic similarity. Surprisingly, the traditional use of text similarity turns out to be ineffective for relevance ranking.},
  keywords    = {imported opendirectory semantic similarity text validation},
}