This paper derives a speech parameter generation algorithm for HMM-based speech synthesis, in which the speech parameter sequence is generated from HMMs whose observation vector consists of a spectral parameter vector and its dynamic feature vectors. In the algorithm, we assume that the state sequence (state and mixture sequence for the multi-mixture case) or a part of the state sequence is unobservable (i.e., hidden or latent). As a result, the algorithm iterates the forward-backward algorithm and the parameter generation algorithm for the case where the state sequence is given. Experimental results show that by using the algorithm, we can reproduce clear formant structure from multi-mixture HMMs as compared with that produced from single-mixture HMMs.
%0 Conference Paper
%1 Tokuda2000
%A Tokuda, Keiichi
%A Yoshimura, Takayoshi
%A Masuko, Takashi
%A Kobayashi, Takao
%A Kitamura, Tadashi
%B Proceedings of the 2000 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)
%C Istanbul, Turkey
%D 2000
%K HMM-based HMM;observation Markov algorithm;multi-mixture algorithms;speech estimation;speech feature generation likelihood models;maximum parameter sequence;hidden sequence;state speech structure;forward-backward synthesis; synthesis;dynamic vector;formant vector;spectral vector;speech
%P 1315-1318
%R 10.1109/ICASSP.2000.861820
%T Speech parameter generation algorithms for HMM-based speech synthesis
%V 3
%X This paper derives a speech parameter generation algorithm for HMM-based speech synthesis, in which the speech parameter sequence is generated from HMMs whose observation vector consists of a spectral parameter vector and its dynamic feature vectors. In the algorithm, we assume that the state sequence (state and mixture sequence for the multi-mixture case) or a part of the state sequence is unobservable (i.e., hidden or latent). As a result, the algorithm iterates the forward-backward algorithm and the parameter generation algorithm for the case where the state sequence is given. Experimental results show that by using the algorithm, we can reproduce clear formant structure from multi-mixture HMMs as compared with that produced from single-mixture HMMs.
@comment{Tokuda2000: conference paper in the ICASSP 2000 proceedings, volume 3; record exported from BibSonomy (see the biburl field).}
@inproceedings{Tokuda2000,
  abstract  = {This paper derives a speech parameter generation algorithm for HMM-based speech synthesis, in which the speech parameter sequence is generated from HMMs whose observation vector consists of a spectral parameter vector and its dynamic feature vectors. In the algorithm, we assume that the state sequence (state and mixture sequence for the multi-mixture case) or a part of the state sequence is unobservable (i.e., hidden or latent). As a result, the algorithm iterates the forward-backward algorithm and the parameter generation algorithm for the case where the state sequence is given. Experimental results show that by using the algorithm, we can reproduce clear formant structure from multi-mixture HMMs as compared with that produced from single-mixture HMMs.},
  added-at  = {2021-02-01T10:51:23.000+0100},
  address   = {Istanbul, Turkey},
  author    = {Tokuda, Keiichi and Yoshimura, Takayoshi and Masuko, Takashi and Kobayashi, Takao and Kitamura, Tadashi},
  biburl    = {https://www.bibsonomy.org/bibtex/26319641db4b0fec7e17c7782dd55e6eb/m-toman},
  booktitle = {Proceedings of the 2000 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  doi       = {10.1109/ICASSP.2000.861820},
  file      = {:pdfs/tokuda_icassp_2000.pdf:PDF},
  interhash = {42e9aa762963e208ebafe64ef3b69a6c},
  intrahash = {6319641db4b0fec7e17c7782dd55e6eb},
  issn      = {1520-6149},
  keywords  = {HMM-based HMM;observation Markov algorithm;multi-mixture algorithms;speech estimation;speech feature generation likelihood models;maximum parameter sequence;hidden sequence;state speech structure;forward-backward synthesis; synthesis;dynamic vector;formant vector;spectral vector;speech},
  month     = jun,
  owner     = {schabus},
  pages     = {1315--1318},
  timestamp = {2021-02-01T10:51:23.000+0100},
  title     = {Speech parameter generation algorithms for {HMM}-based speech synthesis},
  volume    = 3,
  year      = 2000,
}