A trend in automatic speech recognition systems is the use of continuous mixture-density hidden Markov models (HMMs). Despite the good recognition performance that these systems achieve on average in large vocabulary applications, there is a large variability in performance across speakers. Performance degrades dramatically when the user is radically different from the training population. A popular technique that can improve the performance and robustness of a speech recognition system is adapting speech models to the speaker, and more generally to the channel and the task. In continuous mixture-density HMMs the number of component densities is typically very large, and it may not be feasible to acquire a sufficient amount of adaptation data for robust maximum-likelihood estimates. To solve this problem, the authors propose a constrained estimation technique for Gaussian mixture densities. The algorithm is evaluated on the large-vocabulary Wall Street Journal corpus for both native and nonnative speakers of American English. For nonnative speakers, the recognition error rate is approximately halved with only a small amount of adaptation data, and it approaches the speaker-independent accuracy achieved for native speakers. For native speakers, the recognition performance after adaptation improves to the accuracy of speaker-dependent systems that use six times as much training data.
%0 Journal Article
%1 Digalakis1995
%A Digalakis, Vassilios V.
%A Rtischev, Dimitry
%A Neumeyer, Leonardo G.
%D 1995
%J IEEE Transactions on Speech and Audio Processing
%K English;Gaussian Gaussian Journal Markov Street Wall adaptation;Automatic analysis;Hidden corpus;performance;robust data;Vocabulary densities;constrained distribution;Robustness;Speech estimate;speaker estimation;Probability estimation;continuous hidden likelihood maximum-likelihood mixture-density mixtures;automatic models;Maximum models;error models;speech processes;error rate;large-vocabulary recognition recognition;American recognition;Degradation;Error recognition;Training speech statistics;hidden systems;component
%N 5
%P 357-366
%R 10.1109/89.466659
%T Speaker adaptation using constrained estimation of Gaussian mixtures
%V 3
%X A trend in automatic speech recognition systems is the use of continuous mixture-density hidden Markov models (HMMs). Despite the good recognition performance that these systems achieve on average in large vocabulary applications, there is a large variability in performance across speakers. Performance degrades dramatically when the user is radically different from the training population. A popular technique that can improve the performance and robustness of a speech recognition system is adapting speech models to the speaker, and more generally to the channel and the task. In continuous mixture-density HMMs the number of component densities is typically very large, and it may not be feasible to acquire a sufficient amount of adaptation data for robust maximum-likelihood estimates. To solve this problem, the authors propose a constrained estimation technique for Gaussian mixture densities. The algorithm is evaluated on the large-vocabulary Wall Street Journal corpus for both native and nonnative speakers of American English. For nonnative speakers, the recognition error rate is approximately halved with only a small amount of adaptation data, and it approaches the speaker-independent accuracy achieved for native speakers. For native speakers, the recognition performance after adaptation improves to the accuracy of speaker-dependent systems that use six times as much training data.
@article{Digalakis1995,
  abstract  = {A trend in automatic speech recognition systems is the use of continuous mixture-density hidden Markov models (HMMs). Despite the good recognition performance that these systems achieve on average in large vocabulary applications, there is a large variability in performance across speakers. Performance degrades dramatically when the user is radically different from the training population. A popular technique that can improve the performance and robustness of a speech recognition system is adapting speech models to the speaker, and more generally to the channel and the task. In continuous mixture-density HMMs the number of component densities is typically very large, and it may not be feasible to acquire a sufficient amount of adaptation data for robust maximum-likelihood estimates. To solve this problem, the authors propose a constrained estimation technique for Gaussian mixture densities. The algorithm is evaluated on the large-vocabulary Wall Street Journal corpus for both native and nonnative speakers of American English. For nonnative speakers, the recognition error rate is approximately halved with only a small amount of adaptation data, and it approaches the speaker-independent accuracy achieved for native speakers. For native speakers, the recognition performance after adaptation improves to the accuracy of speaker-dependent systems that use six times as much training data.},
  added-at  = {2021-02-01T10:51:23.000+0100},
  author    = {Digalakis, Vassilios V. and Rtischev, Dimitry and Neumeyer, Leonardo G.},
  biburl    = {https://www.bibsonomy.org/bibtex/2231cdba1bf101bab7dc99320002dd259/m-toman},
  doi       = {10.1109/89.466659},
  file      = {:pdfs/digalakis_ieeetrans_1995.pdf:PDF},
  interhash = {4dcea259ea50142d0f990b22368f4bc4},
  intrahash = {231cdba1bf101bab7dc99320002dd259},
  issn      = {1063-6676},
  journal   = {IEEE Transactions on Speech and Audio Processing},
  keywords  = {English;Gaussian Gaussian Journal Markov Street Wall adaptation;Automatic analysis;Hidden corpus;performance;robust data;Vocabulary densities;constrained distribution;Robustness;Speech estimate;speaker estimation;Probability estimation;continuous hidden likelihood maximum-likelihood mixture-density mixtures;automatic models;Maximum models;error models;speech processes;error rate;large-vocabulary recognition recognition;American recognition;Degradation;Error recognition;Training speech statistics;hidden systems;component},
  month     = sep,
  number    = 5,
  owner     = {schabus},
  pages     = {357--366},
  timestamp = {2021-02-01T10:51:23.000+0100},
  title     = {Speaker adaptation using constrained estimation of {Gaussian} mixtures},
  volume    = 3,
  year      = 1995,
}