This article presents a segmental vocoder driven by ultrasound
and optical images (standard CCD camera) of the tongue and lips
for a ``silent speech interface'' application, usable either by
laryngectomized patients or for silent communication. The system
is built around an audiovisual dictionary that associates visual
observations with acoustic observations for each phonetic class.
Visual features are extracted from ultrasound images of the tongue
and from video images of the lips using an image coding technique
based on principal component analysis (PCA). The visual
observations of each phonetic class are modeled by continuous
hidden Markov models (HMMs). The system then combines a phone
recognition stage with corpus-based synthesis. In the recognition
stage, the visual HMMs are used to identify phonetic targets in a
sequence of visual features. In the synthesis stage, these
phonetic targets constrain the dictionary search for the sequence
of diphones that maximizes similarity to the input test data in
the visual space, subject to a concatenation cost in the acoustic
domain. A prosody template is extracted from the training corpus,
and the final speech waveform is generated using ``Harmonic plus
Noise Model'' concatenative synthesis techniques. Experimental
results are based on an audiovisual database containing 1 h of
continuous speech from each of two speakers.
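As a concrete illustration of two algorithmic steps the abstract names,
here are two minimal Python sketches. They are not the authors'
implementation; the function names, array shapes, and Euclidean
distances are assumptions made for the example. The first projects
flattened image frames onto the top principal components of the
training frames, the kind of PCA-based image coding used for the
visual features; the second runs a Viterbi-style search over candidate
diphones, trading each candidate's visual distance to the input
(minimizing distance, i.e. maximizing visual similarity) against an
acoustic distance between adjacent units' boundary frames (the
concatenation cost).

    import numpy as np

    def pca_encode(train_frames, test_frames, k=30):
        # train_frames, test_frames: (N, H*W) flattened grayscale images.
        # Project onto the top-k principal axes of the training set.
        mean = train_frames.mean(axis=0)
        _, _, Vt = np.linalg.svd(train_frames - mean, full_matrices=False)
        basis = Vt[:k]                              # (k, H*W) principal axes
        return (test_frames - mean) @ basis.T       # (M, k) visual features

    def select_diphones(target_costs, join_feats):
        # target_costs[t]: (n_t,) visual distance of each candidate unit
        #   for phonetic target t to the corresponding input features.
        # join_feats[t]: (n_t, 2, D) acoustic features at each candidate's
        #   (left, right) boundary, used for the concatenation cost.
        # Returns one candidate index per target, minimizing total cost.
        T = len(target_costs)
        best = target_costs[0].astype(float)
        back = []
        for t in range(1, T):
            prev_right = join_feats[t - 1][:, 1, :]       # (n_prev, D)
            cur_left = join_feats[t][:, 0, :]             # (n_cur, D)
            concat = np.linalg.norm(
                prev_right[:, None, :] - cur_left[None, :, :], axis=-1)
            total = best[:, None] + concat                # (n_prev, n_cur)
            back.append(total.argmin(axis=0))
            best = total.min(axis=0) + target_costs[t]
        path = [int(best.argmin())]
        for ptr in reversed(back):                        # backtrace
            path.append(int(ptr[path[-1]]))
        return path[::-1]

In the paper's terms, target_costs[t] would be computed between each
dictionary unit and the stretch of input visual features that the HMM
recognition stage assigned to the t-th phonetic target.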
@article{Hueber2010-cm,
author = {Hueber, Thomas and Benaroya, Elie-Laurent and Chollet, G{\'e}rard and Denby, Bruce and Dreyfus, G{\'e}rard and Stone, Maureen},
journal = {Speech Commun.},
keywords = {Corpus-based speech synthesis; Silent speech; Ultrasound; Visual phone recognition; subvocal speech},
month = apr,
number = 4,
pages = {288--300},
title = {Development of a silent speech interface driven by ultrasound and
optical images of the tongue and lips},
volume = 52,
year = 2010
}