For many years, i-vector based audio embedding techniques were the dominant
approach for speaker verification and speaker diarization applications.
However, mirroring the rise of deep learning in various domains, neural network
based audio embeddings, also known as d-vectors, have consistently demonstrated
superior speaker verification performance. In this paper, we build on the
success of d-vector based speaker verification systems to develop a new
d-vector based approach to speaker diarization. Specifically, we combine
LSTM-based d-vector audio embeddings with recent work in non-parametric
clustering to obtain a state-of-the-art speaker diarization system. Our system
is evaluated on three standard public datasets, suggesting that d-vector based
diarization systems offer significant advantages over traditional i-vector
based systems. We achieved a 12.0% diarization error rate on NIST SRE 2000
CALLHOME, while our model is trained with out-of-domain data from voice search
logs.
%0 Generic
%1 wang2017speaker
%A Wang, Quan
%A Downey, Carlton
%A Wan, Li
%A Mansfield, Philip Andrew
%A Moreno, Ignacio Lopez
%D 2017
%K DNN LSTM audio diarization speech
%T Speaker Diarization with LSTM
%U http://arxiv.org/abs/1710.10468
%X For many years, i-vector based audio embedding techniques were the dominant
approach for speaker verification and speaker diarization applications.
However, mirroring the rise of deep learning in various domains, neural network
based audio embeddings, also known as d-vectors, have consistently demonstrated
superior speaker verification performance. In this paper, we build on the
success of d-vector based speaker verification systems to develop a new
d-vector based approach to speaker diarization. Specifically, we combine
LSTM-based d-vector audio embeddings with recent work in non-parametric
clustering to obtain a state-of-the-art speaker diarization system. Our system
is evaluated on three standard public datasets, suggesting that d-vector based
diarization systems offer significant advantages over traditional i-vector
based systems. We achieved a 12.0% diarization error rate on NIST SRE 2000
CALLHOME, while our model is trained with out-of-domain data from voice search
logs.
@comment{wang2017speaker: arXiv preprint 1710.10468. The note field records
that the paper was published at ICASSP 2018. The fields added-at, biburl,
description, interhash, intrahash and timestamp are bookmarking metadata from
the bibsonomy.org export and are ignored by standard bibliography styles.}
@misc{wang2017speaker,
  author        = {Wang, Quan and Downey, Carlton and Wan, Li and Mansfield, Philip Andrew and Moreno, Ignacio Lopez},
  title         = {Speaker Diarization with {LSTM}},
  year          = {2017},
  eprint        = {1710.10468},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1710.10468},
  note          = {Published at ICASSP 2018},
  abstract      = {For many years, i-vector based audio embedding techniques were the dominant
approach for speaker verification and speaker diarization applications.
However, mirroring the rise of deep learning in various domains, neural network
based audio embeddings, also known as d-vectors, have consistently demonstrated
superior speaker verification performance. In this paper, we build on the
success of d-vector based speaker verification systems to develop a new
d-vector based approach to speaker diarization. Specifically, we combine
LSTM-based d-vector audio embeddings with recent work in non-parametric
clustering to obtain a state-of-the-art speaker diarization system. Our system
is evaluated on three standard public datasets, suggesting that d-vector based
diarization systems offer significant advantages over traditional i-vector
based systems. We achieved a 12.0\% diarization error rate on NIST SRE 2000
CALLHOME, while our model is trained with out-of-domain data from voice search
logs.},
  keywords      = {DNN LSTM audio diarization speech},
  description   = {12.0\% DER on NIST SRE 2000 CALLHOME},
  added-at      = {2019-07-09T20:51:32.000+0200},
  timestamp     = {2019-07-09T20:52:51.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/29580a6305e12577cd0788ce009ff724d/conscious_droid},
  interhash     = {d412f5fa80438da6bce06ddd857767ff},
  intrahash     = {9580a6305e12577cd0788ce009ff724d},
}