Achieving artificial visual reasoning - the ability to answer image-related
questions which require a multi-step, high-level process - is an important step
towards artificial general intelligence. This multi-modal task requires
learning a question-dependent, structured reasoning process over images from
language. Standard deep learning approaches tend to exploit biases in the data
rather than learn this underlying structure, while leading methods learn to
visually reason successfully but are hand-crafted for reasoning. We show that a
general-purpose, Conditional Batch Normalization approach achieves
state-of-the-art results on the CLEVR Visual Reasoning benchmark with a 2.4%
error rate. We outperform the next best end-to-end method (4.5%) and even
methods that use extra supervision (3.1%). We probe our model to shed light on
how it reasons, showing it has learned a question-dependent, multi-step
process. Previous work has operated under the assumption that visual reasoning
calls for a specialized architecture, but we show that a general architecture
with proper conditioning can learn to visually reason effectively.
%0 Generic
%1 perez2017learning
%A Perez, Ethan
%A de Vries, Harm
%A Strub, Florian
%A Dumoulin, Vincent
%A Courville, Aaron
%D 2017
%K conditional-modelling machine-learning style-transfer
%T Learning Visual Reasoning Without Strong Priors
%U http://arxiv.org/abs/1707.03017
%X Achieving artificial visual reasoning - the ability to answer image-related
questions which require a multi-step, high-level process - is an important step
towards artificial general intelligence. This multi-modal task requires
learning a question-dependent, structured reasoning process over images from
language. Standard deep learning approaches tend to exploit biases in the data
rather than learn this underlying structure, while leading methods learn to
visually reason successfully but are hand-crafted for reasoning. We show that a
general-purpose, Conditional Batch Normalization approach achieves
state-of-the-art results on the CLEVR Visual Reasoning benchmark with a 2.4%
error rate. We outperform the next best end-to-end method (4.5%) and even
methods that use extra supervision (3.1%). We probe our model to shed light on
how it reasons, showing it has learned a question-dependent, multi-step
process. Previous work has operated under the assumption that visual reasoning
calls for a specialized architecture, but we show that a general architecture
with proper conditioning can learn to visually reason effectively.
% arXiv preprint (later presented at the ICML 2017 MLSLP workshop).
% Fixed: escaped literal % in abstract, moved the arXiv id from the glued
% note text into proper eprint/archiveprefix fields, cleaned the note.
@misc{perez2017learning,
  abstract      = {Achieving artificial visual reasoning - the ability to answer image-related
questions which require a multi-step, high-level process - is an important step
towards artificial general intelligence. This multi-modal task requires
learning a question-dependent, structured reasoning process over images from
language. Standard deep learning approaches tend to exploit biases in the data
rather than learn this underlying structure, while leading methods learn to
visually reason successfully but are hand-crafted for reasoning. We show that a
general-purpose, Conditional Batch Normalization approach achieves
state-of-the-art results on the CLEVR Visual Reasoning benchmark with a 2.4\%
error rate. We outperform the next best end-to-end method (4.5\%) and even
methods that use extra supervision (3.1\%). We probe our model to shed light on
how it reasons, showing it has learned a question-dependent, multi-step
process. Previous work has operated under the assumption that visual reasoning
calls for a specialized architecture, but we show that a general architecture
with proper conditioning can learn to visually reason effectively.},
  added-at      = {2017-08-24T23:24:54.000+0200},
  author        = {Perez, Ethan and de Vries, Harm and Strub, Florian and Dumoulin, Vincent and Courville, Aaron},
  biburl        = {https://www.bibsonomy.org/bibtex/2dfb60e9a4351d22a36e132e72143dfb5/hprop},
  description   = {Learning Visual Reasoning Without Strong Priors},
  eprint        = {1707.03017},
  archiveprefix = {arXiv},
  interhash     = {7b8d68645a341b561682594674e7b8f0},
  intrahash     = {dfb60e9a4351d22a36e132e72143dfb5},
  keywords      = {conditional-modelling machine-learning style-transfer},
  note          = {Presented at the ICML 2017 Workshop on Machine Learning in Speech and Language Processing},
  timestamp     = {2017-08-24T23:24:54.000+0200},
  title         = {Learning Visual Reasoning Without Strong Priors},
  url           = {http://arxiv.org/abs/1707.03017},
  year          = {2017},
}