We propose VisualBERT, a simple and flexible framework for modeling a broad
range of vision-and-language tasks. VisualBERT consists of a stack of
Transformer layers that implicitly align elements of an input text and regions
in an associated input image with self-attention. We further propose two
visually-grounded language model objectives for pre-training VisualBERT on
image caption data. Experiments on four vision-and-language tasks (VQA, VCR,
NLVR2, and Flickr30K) show that VisualBERT outperforms or rivals
state-of-the-art models while being significantly simpler. Further analysis
demonstrates that VisualBERT can ground elements of language to image regions
without any explicit supervision and is even sensitive to syntactic
relationships, tracking, for example, associations between verbs and image
regions corresponding to their arguments.
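As a rough illustration of the architecture the abstract describes, the Python sketch below embeds word pieces and detected image-region features into one sequence and runs a shared Transformer encoder over it, so self-attention can relate words to regions. It is a minimal sketch, not the paper's implementation: the hyperparameters (768-dim hidden size, 12 layers/heads, 2048-dim detector features) and all names are illustrative assumptions.

import torch
import torch.nn as nn

class VisualBERTSketch(nn.Module):
    # Joint text-and-region encoder in the spirit of VisualBERT.
    # All sizes below are illustrative assumptions, not the paper's
    # exact configuration.
    def __init__(self, vocab_size=30522, hidden=768, n_layers=12,
                 n_heads=12, region_dim=2048, max_len=512):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, hidden)
        self.pos_emb = nn.Embedding(max_len, hidden)
        self.seg_emb = nn.Embedding(2, hidden)  # 0 = text, 1 = vision
        self.region_proj = nn.Linear(region_dim, hidden)  # map detector features to hidden size
        layer = nn.TransformerEncoderLayer(d_model=hidden, nhead=n_heads,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

    def forward(self, token_ids, region_feats):
        # token_ids: (B, T) word-piece ids; region_feats: (B, R, region_dim)
        T = token_ids.shape[1]
        R = region_feats.shape[1]
        dev = token_ids.device
        text = (self.token_emb(token_ids)
                + self.pos_emb(torch.arange(T, device=dev))
                + self.seg_emb(torch.zeros(T, dtype=torch.long, device=dev)))
        vision = (self.region_proj(region_feats)
                  + self.seg_emb(torch.ones(R, dtype=torch.long, device=dev)))
        # One concatenated sequence: each self-attention layer lets every
        # word attend to every region (and vice versa), which is where the
        # implicit text-region alignment happens.
        return self.encoder(torch.cat([text, vision], dim=1))  # (B, T+R, hidden)

# Example: a batch of 2 captions (16 tokens) with 36 region features each.
model = VisualBERTSketch()
out = model(torch.randint(0, 30522, (2, 16)), torch.randn(2, 36, 2048))  # (2, 52, 768)

In the paper's pre-training, heads on top of such an encoder drive the two visually-grounded objectives (masked language modeling with the image regions visible, and predicting whether a caption matches the image); those heads are omitted here for brevity.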
Description: [1908.03557] VisualBERT: A Simple and Performant Baseline for Vision and Language
%0 Generic
%1 li2019visualbert
%A Li, Liunian Harold
%A Yatskar, Mark
%A Yin, Da
%A Hsieh, Cho-Jui
%A Chang, Kai-Wei
%D 2019
%K bert image text vision visualbert
%T VisualBERT: A Simple and Performant Baseline for Vision and Language
%U http://arxiv.org/abs/1908.03557
@misc{li2019visualbert,
author = {Li, Liunian Harold and Yatskar, Mark and Yin, Da and Hsieh, Cho-Jui and Chang, Kai-Wei},
keywords = {bert image text vision visualbert},
note = {arXiv:1908.03557. Comment: Work in Progress},
title = {VisualBERT: A Simple and Performant Baseline for Vision and Language},
url = {http://arxiv.org/abs/1908.03557},
year = 2019
}