Applying convolutional neural networks to large images is computationally expensive
because the amount of computation scales linearly with the number of
image pixels. We present a novel recurrent neural network model that is capable
of extracting information from an image or video by adaptively selecting
a sequence of regions or locations and only processing the selected regions at
high resolution. Like convolutional neural networks, the proposed model has a
degree of translation invariance built-in, but the amount of computation it performs
can be controlled independently of the input image size. While the model
is non-differentiable, it can be trained using reinforcement learning methods to
learn task-specific policies. We evaluate our model on several image classification
tasks, where it significantly outperforms a convolutional neural network baseline
on cluttered images, and on a dynamic visual control problem, where it learns to
track a simple object without an explicit training signal for doing so.
%0 Conference Paper
%1 Mnih2014RecurrentMO
%A Mnih, Volodymyr
%A Heess, Nicolas
%A Graves, Alex
%A Kavukcuoglu, Koray
%B NIPS
%D 2014
%K attention deeplearning
%T Recurrent Models of Visual Attention
%X Applying convolutional neural networks to large images is computationally expensive
because the amount of computation scales linearly with the number of
image pixels. We present a novel recurrent neural network model that is capable
of extracting information from an image or video by adaptively selecting
a sequence of regions or locations and only processing the selected regions at
high resolution. Like convolutional neural networks, the proposed model has a
degree of translation invariance built-in, but the amount of computation it performs
can be controlled independently of the input image size. While the model
is non-differentiable, it can be trained using reinforcement learning methods to
learn task-specific policies. We evaluate our model on several image classification
tasks, where it significantly outperforms a convolutional neural network baseline
on cluttered images, and on a dynamic visual control problem, where it learns to
track a simple object without an explicit training signal for doing so.
Note (review): expanded booktitle from the bare acronym "NIPS" to the full
proceedings name (acronym kept, brace-protected, for styles that recase),
braced the year for delimiter consistency, and aligned fields.
@inproceedings{Mnih2014RecurrentMO,
  abstract  = {Applying convolutional neural networks to large images is computationally expensive
because the amount of computation scales linearly with the number of
image pixels. We present a novel recurrent neural network model that is capable
of extracting information from an image or video by adaptively selecting
a sequence of regions or locations and only processing the selected regions at
high resolution. Like convolutional neural networks, the proposed model has a
degree of translation invariance built-in, but the amount of computation it performs
can be controlled independently of the input image size. While the model
is non-differentiable, it can be trained using reinforcement learning methods to
learn task-specific policies. We evaluate our model on several image classification
tasks, where it significantly outperforms a convolutional neural network baseline
on cluttered images, and on a dynamic visual control problem, where it learns to
track a simple object without an explicit training signal for doing so.},
  added-at  = {2016-10-28T10:36:55.000+0200},
  author    = {Mnih, Volodymyr and Heess, Nicolas and Graves, Alex and Kavukcuoglu, Koray},
  biburl    = {https://www.bibsonomy.org/bibtex/215cb47c9ab072e5350f804c3509cbcf7/dallmann},
  booktitle = {Advances in Neural Information Processing Systems ({NIPS})},
  interhash = {1e0f57399dcaf547e3a14bc21b1a5fbd},
  intrahash = {15cb47c9ab072e5350f804c3509cbcf7},
  keywords  = {attention deeplearning},
  timestamp = {2016-10-28T10:36:55.000+0200},
  title     = {Recurrent Models of Visual Attention},
  year      = {2014},
}