We present a state-of-the-art speech recognition system developed using
end-to-end deep learning. Our architecture is significantly simpler than
traditional speech systems, which rely on laboriously engineered processing
pipelines; these traditional systems also tend to perform poorly when used in
noisy environments. In contrast, our system does not need hand-designed
components to model background noise, reverberation, or speaker variation, but
instead directly learns a function that is robust to such effects. We do not
need a phoneme dictionary, nor even the concept of a "phoneme." Key to our
approach is a well-optimized RNN training system that uses multiple GPUs, as
well as a set of novel data synthesis techniques that allow us to efficiently
obtain a large amount of varied data for training. Our system, called Deep
Speech, outperforms previously published results on the widely studied
Switchboard Hub5'00, achieving 16.0% error on the full test set. Deep Speech
also handles challenging noisy environments better than widely used,
state-of-the-art commercial speech systems.
Description
[1412.5567] Deep Speech: Scaling up end-to-end speech recognition
%0 Generic
%1 hannun2014speech
%A Hannun, Awni
%A Case, Carl
%A Casper, Jared
%A Catanzaro, Bryan
%A Diamos, Greg
%A Elsen, Erich
%A Prenger, Ryan
%A Satheesh, Sanjeev
%A Sengupta, Shubho
%A Coates, Adam
%A Ng, Andrew Y.
%D 2014
%K 2014 arxiv baidu deep-learning sound speech
%T Deep Speech: Scaling up end-to-end speech recognition
%U http://arxiv.org/abs/1412.5567
%X We present a state-of-the-art speech recognition system developed using
end-to-end deep learning. Our architecture is significantly simpler than
traditional speech systems, which rely on laboriously engineered processing
pipelines; these traditional systems also tend to perform poorly when used in
noisy environments. In contrast, our system does not need hand-designed
components to model background noise, reverberation, or speaker variation, but
instead directly learns a function that is robust to such effects. We do not
need a phoneme dictionary, nor even the concept of a "phoneme." Key to our
approach is a well-optimized RNN training system that uses multiple GPUs, as
well as a set of novel data synthesis techniques that allow us to efficiently
obtain a large amount of varied data for training. Our system, called Deep
Speech, outperforms previously published results on the widely studied
Switchboard Hub5'00, achieving 16.0% error on the full test set. Deep Speech
also handles challenging noisy environments better than widely used,
state-of-the-art commercial speech systems.
@misc{hannun2014speech,
  abstract      = {We present a state-of-the-art speech recognition system developed using
end-to-end deep learning. Our architecture is significantly simpler than
traditional speech systems, which rely on laboriously engineered processing
pipelines; these traditional systems also tend to perform poorly when used in
noisy environments. In contrast, our system does not need hand-designed
components to model background noise, reverberation, or speaker variation, but
instead directly learns a function that is robust to such effects. We do not
need a phoneme dictionary, nor even the concept of a "phoneme." Key to our
approach is a well-optimized RNN training system that uses multiple GPUs, as
well as a set of novel data synthesis techniques that allow us to efficiently
obtain a large amount of varied data for training. Our system, called Deep
Speech, outperforms previously published results on the widely studied
Switchboard Hub5'00, achieving 16.0% error on the full test set. Deep Speech
also handles challenging noisy environments better than widely used,
state-of-the-art commercial speech systems.},
  added-at      = {2018-04-01T16:07:36.000+0200},
  archiveprefix = {arXiv},
  author        = {Hannun, Awni and Case, Carl and Casper, Jared and Catanzaro, Bryan and Diamos, Greg and Elsen, Erich and Prenger, Ryan and Satheesh, Sanjeev and Sengupta, Shubho and Coates, Adam and Ng, Andrew Y.},
  biburl        = {https://www.bibsonomy.org/bibtex/218ffb1615c008c01737b3c95ea044045/achakraborty},
  description   = {[1412.5567] Deep Speech: Scaling up end-to-end speech recognition},
  eprint        = {1412.5567},
  interhash     = {61da83f49db2cc0c2c5a15ef9a266834},
  intrahash     = {18ffb1615c008c01737b3c95ea044045},
  keywords      = {2014 arxiv baidu deep-learning sound speech},
  primaryclass  = {cs.CL},
  timestamp     = {2018-04-01T16:07:36.000+0200},
  title         = {{Deep Speech}: Scaling up End-to-End Speech Recognition},
  url           = {http://arxiv.org/abs/1412.5567},
  year          = {2014},
}