Deep Bidirectional LSTM (DBLSTM) recurrent neural networks
have recently been shown to give state-of-the-art performance
on the TIMIT speech database. However, the results
in that work relied on recurrent-neural-network-specific
objective functions, which are difficult to integrate with existing
large vocabulary speech recognition systems. This paper
investigates the use of DBLSTM as an acoustic model in a
standard neural network-HMM hybrid system. We find that a
DBLSTM-HMM hybrid gives equally good results on TIMIT
as the previous work. It also outperforms both GMM and
deep network benchmarks on a subset of the Wall Street Journal
corpus. However the improvement in word error rate over
the deep network is modest, despite a great increase in frame-level
accuracy. We conclude that the hybrid approach with
DBLSTM appears to be well suited for tasks where acoustic
modelling predominates. Further investigation needs to be
conducted to understand how to better leverage the improvements
in frame-level accuracy towards better word error rates.
%0 Conference Paper
%1 Graves2013HybridSR
%A Graves, Alex
%A Jaitly, Navdeep
%A Mohamed, Abdel-rahman
%B ASRU
%D 2013
%K deep_learning lstm
%T Hybrid speech recognition with Deep Bidirectional LSTM
%X Deep Bidirectional LSTM (DBLSTM) recurrent neural networks
have recently been shown to give state-of-the-art performance
on the TIMIT speech database. However, the results
in that work relied on recurrent-neural-network-specific
objective functions, which are difficult to integrate with existing
large vocabulary speech recognition systems. This paper
investigates the use of DBLSTM as an acoustic model in a
standard neural network-HMM hybrid system. We find that a
DBLSTM-HMM hybrid gives equally good results on TIMIT
as the previous work. It also outperforms both GMM and
deep network benchmarks on a subset of the Wall Street Journal
corpus. However the improvement in word error rate over
the deep network is modest, despite a great increase in frame-level
accuracy. We conclude that the hybrid approach with
DBLSTM appears to be well suited for tasks where acoustic
modelling predominates. Further investigation needs to be
conducted to understand how to better leverage the improvements
in frame-level accuracy towards better word error rates.
@comment{Graves2013HybridSR: conference-paper record exported from BibSonomy
(see the biburl field). Names are stored in "Last, First" form; the third
author's hyphenated given name is kept in the first-name part so that it is
not parsed as a lowercase "von" particle of the surname.}
@inproceedings{Graves2013HybridSR,
  author     = {Graves, Alex and Jaitly, Navdeep and Mohamed, Abdel-rahman},
  title      = {Hybrid Speech Recognition with Deep Bidirectional {LSTM}},
  booktitle  = {{IEEE} Workshop on Automatic Speech Recognition and Understanding ({ASRU})},
  year       = 2013,
  abstract   = {Deep Bidirectional LSTM (DBLSTM) recurrent neural networks
have recently been shown to give state-of-the-art performance
on the TIMIT speech database. However, the results
in that work relied on recurrent-neural-network-specific
objective functions, which are difficult to integrate with existing
large vocabulary speech recognition systems. This paper
investigates the use of DBLSTM as an acoustic model in a
standard neural network-HMM hybrid system. We find that a
DBLSTM-HMM hybrid gives equally good results on TIMIT
as the previous work. It also outperforms both GMM and
deep network benchmarks on a subset of the Wall Street Journal
corpus. However the improvement in word error rate over
the deep network is modest, despite a great increase in frame-level
accuracy. We conclude that the hybrid approach with
DBLSTM appears to be well suited for tasks where acoustic
modelling predominates. Further investigation needs to be
conducted to understand how to better leverage the improvements
in frame-level accuracy towards better word error rates.},
  keywords   = {deep_learning lstm},
  added-at   = {2016-11-15T09:24:08.000+0100},
  timestamp  = {2016-11-15T09:25:30.000+0100},
  biburl     = {https://www.bibsonomy.org/bibtex/25107c9334622aa9b59e7210775aeeaaf/dallmann},
  interhash  = {ca150b6bd7634c0b96a94a38923d170d},
  intrahash  = {5107c9334622aa9b59e7210775aeeaaf}
}