Convolutional neural networks (CNNs) have revolutionized the world of
computer vision over the last few years, pushing image classification beyond
human accuracy. The computational effort of today's CNNs requires power-hungry
parallel processors or GP-GPUs. Recent developments in CNN accelerators for
system-on-chip integration have reduced energy consumption significantly.
Unfortunately, even these highly optimized devices are above the power envelope
imposed by mobile and deeply embedded applications and face hard limitations
caused by CNN weight I/O and storage. This prevents the adoption of CNNs in
future ultra-low power Internet of Things end-nodes for near-sensor analytics.
Recent algorithmic and theoretical advancements enable competitive
classification accuracy even when limiting CNNs to binary (+1/-1) weights
during training. These new findings bring major optimization opportunities in
the arithmetic core by removing the need for expensive multiplications, as well
as reducing I/O bandwidth and storage. In this work, we present an accelerator
optimized for binary-weight CNNs that achieves 1510 GOp/s at 1.2 V on a core
area of only 1.33 MGE (Million Gate Equivalent) or 0.19 mm$^2$ and with a power
dissipation of 895 \muW in UMC 65 nm technology at 0.6 V. Our accelerator
significantly outperforms the state-of-the-art in terms of energy and area
efficiency achieving 61.2 TOp/s/W@0.6 V and 1135 GOp/s/MGE@1.2 V, respectively.
Description
[1606.05487] YodaNN: An Architecture for Ultra-Low Power Binary-Weight CNN Acceleration
%0 Generic
%1 AndCav2016YodaNN
%A Andri, Renzo
%A Cavigelli, Lukas
%A Rossi, Davide
%A Benini, Luca
%D 2016
%K architecture binary cnn compression deep_learning
%T YodaNN: An Architecture for Ultra-Low Power Binary-Weight CNN
Acceleration
%U http://arxiv.org/abs/1606.05487
%X Convolutional neural networks (CNNs) have revolutionized the world of
computer vision over the last few years, pushing image classification beyond
human accuracy. The computational effort of today's CNNs requires power-hungry
parallel processors or GP-GPUs. Recent developments in CNN accelerators for
system-on-chip integration have reduced energy consumption significantly.
Unfortunately, even these highly optimized devices are above the power envelope
imposed by mobile and deeply embedded applications and face hard limitations
caused by CNN weight I/O and storage. This prevents the adoption of CNNs in
future ultra-low power Internet of Things end-nodes for near-sensor analytics.
Recent algorithmic and theoretical advancements enable competitive
classification accuracy even when limiting CNNs to binary (+1/-1) weights
during training. These new findings bring major optimization opportunities in
the arithmetic core by removing the need for expensive multiplications, as well
as reducing I/O bandwidth and storage. In this work, we present an accelerator
optimized for binary-weight CNNs that achieves 1510 GOp/s at 1.2 V on a core
area of only 1.33 MGE (Million Gate Equivalent) or 0.19 mm$^2$ and with a power
dissipation of 895 \muW in UMC 65 nm technology at 0.6 V. Our accelerator
significantly outperforms the state-of-the-art in terms of energy and area
efficiency achieving 61.2 TOp/s/W@0.6 V and 1135 GOp/s/MGE@1.2 V, respectively.
@misc{AndCav2016YodaNN,
abstract = {Convolutional neural networks (CNNs) have revolutionized the world of
computer vision over the last few years, pushing image classification beyond
human accuracy. The computational effort of today's CNNs requires power-hungry
parallel processors or GP-GPUs. Recent developments in CNN accelerators for
system-on-chip integration have reduced energy consumption significantly.
Unfortunately, even these highly optimized devices are above the power envelope
imposed by mobile and deeply embedded applications and face hard limitations
caused by CNN weight I/O and storage. This prevents the adoption of CNNs in
future ultra-low power Internet of Things end-nodes for near-sensor analytics.
Recent algorithmic and theoretical advancements enable competitive
classification accuracy even when limiting CNNs to binary (+1/-1) weights
during training. These new findings bring major optimization opportunities in
the arithmetic core by removing the need for expensive multiplications, as well
as reducing I/O bandwidth and storage. In this work, we present an accelerator
optimized for binary-weight CNNs that achieves 1510 GOp/s at 1.2 V on a core
area of only 1.33 MGE (Million Gate Equivalent) or 0.19 mm$^2$ and with a power
dissipation of 895 {\mu}W in UMC 65 nm technology at 0.6 V. Our accelerator
significantly outperforms the state-of-the-art in terms of energy and area
efficiency achieving 61.2 TOp/s/W@0.6 V and 1135 GOp/s/MGE@1.2 V, respectively.},
added-at = {2018-06-30T14:08:16.000+0200},
author = {Andri, Renzo and Cavigelli, Lukas and Rossi, Davide and Benini, Luca},
biburl = {https://www.bibsonomy.org/bibtex/22256ac8ddb3f65401fd7387e154607ba/loroch},
description = {[1606.05487] YodaNN: An Architecture for Ultra-Low Power Binary-Weight CNN Acceleration},
interhash = {47ef5986e9831c6081c493388db73407},
intrahash = {2256ac8ddb3f65401fd7387e154607ba},
keywords = {architecture binary cnn compression deep_learning},
note = {cite arxiv:1606.05487},
timestamp = {2018-06-30T14:08:16.000+0200},
title = {YodaNN: An Architecture for Ultra-Low Power Binary-Weight CNN
Acceleration},
url = {http://arxiv.org/abs/1606.05487},
year = 2016
}