This paper describes an HMM-based speech synthesis system (HTS), in which the speech waveform is generated from HMM themselves, and applies it to English speech synthesis using the general speech synthesis architecture of Festival. Similarly to other data-driven speech synthesis approaches, HTS has a compact language dependent module: a list of contextual factors. Thus, it could easily be extended to other languages, though the first version of HTS was implemented for Japanese. The resulting run-time engine of HTS has the advantage of being small: less than 1 Mbyte, excluding text analysis part. Furthermore, HTS can easily change voice characteristics of synthesized speech by using a speaker adaptation technique developed for speech recognition. The relation between the HMM-based approach and other unit selection approaches is also discussed.
%0 Conference Paper
%1 Tokuda2002
%A Tokuda, Keiichi
%A Zen, Heiga
%A Black, Alan W.
%B Proceedings of the 2002 IEEE Workshop on Speech Synthesis
%C Santa Monica, CA, USA
%D 2002
%K Markov adaptation;speech dependent extraction;Runtime;Signal factors;language generation;Computer hidden language;Festival;HMM;HTS;contextual languages;Parameter languages;speech models;High models;natural module;speaker science;Databases;Engines;Hidden superconductors;Natural synthesis synthesis;English synthesis;Speech system;speech temperature waveform
%P 227-230
%R 10.1109/WSS.2002.1224415
%T An HMM-based speech synthesis system applied to English
%X This paper describes an HMM-based speech synthesis system (HTS), in which the speech waveform is generated from HMM themselves, and applies it to English speech synthesis using the general speech synthesis architecture of Festival. Similarly to other data-driven speech synthesis approaches, HTS has a compact language dependent module: a list of contextual factors. Thus, it could easily be extended to other languages, though the first version of HTS was implemented for Japanese. The resulting run-time engine of HTS has the advantage of being small: less than 1 Mbyte, excluding text analysis part. Furthermore, HTS can easily change voice characteristics of synthesized speech by using a speaker adaptation technique developed for speech recognition. The relation between the HMM-based approach and other unit selection approaches is also discussed.
@comment{Tokuda, Zen and Black, 2002 workshop paper on the HMM-based speech synthesis system (HTS) applied to English.}
@inproceedings{Tokuda2002,
abstract = {This paper describes an HMM-based speech synthesis system (HTS), in which the speech waveform is generated from HMM themselves, and applies it to English speech synthesis using the general speech synthesis architecture of Festival. Similarly to other data-driven speech synthesis approaches, HTS has a compact language dependent module: a list of contextual factors. Thus, it could easily be extended to other languages, though the first version of HTS was implemented for Japanese. The resulting run-time engine of HTS has the advantage of being small: less than 1 Mbyte, excluding text analysis part. Furthermore, HTS can easily change voice characteristics of synthesized speech by using a speaker adaptation technique developed for speech recognition. The relation between the HMM-based approach and other unit selection approaches is also discussed.},
added-at = {2021-02-01T10:51:23.000+0100},
address = {Santa Monica, CA, USA},
author = {Tokuda, Keiichi and Zen, Heiga and Black, Alan W.},
biburl = {https://www.bibsonomy.org/bibtex/22e222854fa1a473e24cc93592478380f/m-toman},
booktitle = {Proceedings of the 2002 IEEE Workshop on Speech Synthesis},
doi = {10.1109/WSS.2002.1224415},
file = {:pdfs/tokuda_ieeesynth_2002.pdf:PDF},
interhash = {81425597a55b4408c60f1a04d4fe95ba},
intrahash = {2e222854fa1a473e24cc93592478380f},
keywords = {Markov adaptation;speech dependent extraction;Runtime;Signal factors;language generation;Computer hidden language;Festival;HMM;HTS;contextual languages;Parameter languages;speech models;High models;natural module;speaker science;Databases;Engines;Hidden superconductors;Natural synthesis synthesis;English synthesis;Speech system;speech temperature waveform},
month = sep,
owner = {schabus},
pages = {227--230},
timestamp = {2021-02-01T10:51:23.000+0100},
title = {An {HMM}-based speech synthesis system applied to {English}},
year = 2002
}