Textual grounding, i.e., linking words to objects in images, is a challenging
but important task for robotics and human-computer interaction. Existing
techniques benefit from recent progress in deep learning and generally
formulate the task as a supervised learning problem, selecting a bounding box
from a set of possible options. To train these deep net based approaches,
access to a large-scale dataset is required; however, constructing such a
dataset is time-consuming and expensive. Therefore, we develop a completely
unsupervised mechanism for textual grounding using hypothesis testing as a
mechanism to link words to detected image concepts. We demonstrate our approach
on the ReferIt Game dataset and the Flickr30k data, outperforming baselines by
7.98\% and 6.96\% respectively.
%0 Generic
%1 citeulike:14581108
%A xxx,
%D 2018
%K attention detection grounding semisup
%T Unsupervised Textual Grounding: Linking Words to Image Concepts
%U http://arxiv.org/abs/1803.11185
%X Textual grounding, i.e., linking words to objects in images, is a challenging
but important task for robotics and human-computer interaction. Existing
techniques benefit from recent progress in deep learning and generally
formulate the task as a supervised learning problem, selecting a bounding box
from a set of possible options. To train these deep net based approaches,
access to a large-scale dataset is required; however, constructing such a
dataset is time-consuming and expensive. Therefore, we develop a completely
unsupervised mechanism for textual grounding using hypothesis testing as a
mechanism to link words to detected image concepts. We demonstrate our approach
on the ReferIt Game dataset and the Flickr30k data, outperforming baselines by
7.98\% and 6.96\% respectively.
@misc{citeulike:14581108,
  abstract             = {Textual grounding, i.e., linking words to objects in images, is a
                          challenging but important task for robotics and human-computer
                          interaction. Existing techniques benefit from recent progress in deep
                          learning and generally formulate the task as a supervised learning
                          problem, selecting a bounding box from a set of possible options. To
                          train these deep net based approaches, access to a large-scale dataset
                          is required; however, constructing such a dataset is time-consuming and
                          expensive. Therefore, we develop a completely unsupervised mechanism for
                          textual grounding using hypothesis testing as a mechanism to link words
                          to detected image concepts. We demonstrate our approach on the ReferIt
                          Game dataset and the Flickr30k data, outperforming baselines by 7.98\%
                          and 6.96\% respectively.},
  added-at             = {2019-02-27T22:23:29.000+0100},
  archiveprefix        = {arXiv},
  author               = {xxx},
  internal-note        = {NOTE(review): the author field is an export placeholder ("xxx");
                          fill in the real author list from arXiv:1803.11185 before citing},
  biburl               = {https://www.bibsonomy.org/bibtex/29edbb55aa9717d9cb08fe8982e5a2d7f/nmatsuk},
  citeulike-article-id = {14581108},
  citeulike-linkout-0  = {http://arxiv.org/abs/1803.11185},
  citeulike-linkout-1  = {http://arxiv.org/pdf/1803.11185},
  day                  = 29,
  eprint               = {1803.11185},
  interhash            = {241d8c62bd2882d41d4457ed57e8e931},
  intrahash            = {9edbb55aa9717d9cb08fe8982e5a2d7f},
  keywords             = {attention detection grounding semisup},
  month                = mar,
  posted-at            = {2018-05-05 16:16:48},
  priority             = {0},
  timestamp            = {2019-02-27T22:23:29.000+0100},
  title                = {Unsupervised Textual Grounding: Linking Words to Image Concepts},
  url                  = {http://arxiv.org/abs/1803.11185},
  year                 = 2018
}