% Duygulu, Barnard, de Freitas, Forsyth -- ECCV 2002 conference paper on
% annotating image regions with words (object recognition cast as translation).
% Title is stored in Title Case so that styles may down-case it as needed;
% authors use the unambiguous "von Last, First" form.
% NOTE(review): publisher/series/doi are not recorded here -- confirm and add.
@inproceedings{Duygulu2002,
  author    = {Duygulu, Pinar and Barnard, Kobus and de Freitas, Nando and Forsyth, David},
  title     = {Object Recognition as Machine Translation: Learning a Lexicon for
               a Fixed Image Vocabulary},
  booktitle = {7th European Conference on Computer Vision},
  pages     = {97--112},
  year      = {2002},
  url       = {http://kobus.ca/research/publications/ECCV-02-1/ECCV-02-1.pdf},
  abstract  = {We describe a model of object recognition as machine translation.
In this model, recognition is a process of annotating image regions
with words. Firstly, images are segmented into regions, which are
classified into region types using a variety of features. A mapping
between region types and keywords supplied with the images, is then
learned, using a method based around EM. This process is analogous
with learning a lexicon from an aligned bitext. For the implementation
we describe, these words are nouns taken from a large vocabulary.
On a large test set, the method can predict numerous words with high
accuracy. Simple methods identify words that cannot be predicted
well. We show how to cluster words that individually are difficult
to predict into clusters that can be predicted well --- for example,
we cannot predict the distinction between train and locomotive using
the current set of features, but we can predict the underlying concept.
The method is trained on a substantial collection of images. Extensive
experimental results illustrate the strengths and weaknesses of the
approach.},
  keywords  = {ImageAnnotation},
  owner     = {Marco},
  timestamp = {2007.09.26},
}