Source separation is the task of separating an audio recording into individual
sound sources. Source separation is fundamental for computational auditory
scene analysis. Previous work on source separation has focused on separating
particular sound classes such as speech and music. Much previous work
requires mixture and clean source pairs for training. In this work, we propose a
source separation framework trained with weakly labelled data. Weakly labelled
data only contains the tags of an audio clip, without the occurrence time of
sound events. We first train a sound event detection system with AudioSet. The
trained sound event detection system is used to detect segments that are most
likely to contain a target sound event. Then a regression is learnt from a
mixture of two randomly selected segments to a target segment conditioned on
the audio tagging prediction of the target segment. Our proposed system can
separate 527 kinds of sound classes from AudioSet within a single system. A
U-Net is adopted for the separation system and achieves an average SDR of 5.67
dB over 527 sound classes in AudioSet.
%0 Generic
%1 kong2020source
%A Kong, Qiuqiang
%A Wang, Yuxuan
%A Song, Xuchen
%A Cao, Yin
%A Wang, Wenwu
%A Plumbley, Mark D.
%D 2020
%K source_separation weak_supervision
%T Source separation with weakly labelled data: An approach to
computational auditory scene analysis
%U http://arxiv.org/abs/2002.02065
%X Source separation is the task of separating an audio recording into individual
sound sources. Source separation is fundamental for computational auditory
scene analysis. Previous work on source separation has focused on separating
particular sound classes such as speech and music. Much previous work
requires mixture and clean source pairs for training. In this work, we propose a
source separation framework trained with weakly labelled data. Weakly labelled
data only contains the tags of an audio clip, without the occurrence time of
sound events. We first train a sound event detection system with AudioSet. The
trained sound event detection system is used to detect segments that are most
likely to contain a target sound event. Then a regression is learnt from a
mixture of two randomly selected segments to a target segment conditioned on
the audio tagging prediction of the target segment. Our proposed system can
separate 527 kinds of sound classes from AudioSet within a single system. A
U-Net is adopted for the separation system and achieves an average SDR of 5.67
dB over 527 sound classes in AudioSet.
@misc{kong2020source,
  abstract      = {Source separation is the task to separate an audio recording into individual
sound sources. Source separation is fundamental for computational auditory
scene analysis. Previous work on source separation has focused on separating
particular sound classes such as speech and music. Many of previous work
require mixture and clean source pairs for training. In this work, we propose a
source separation framework trained with weakly labelled data. Weakly labelled
data only contains the tags of an audio clip, without the occurrence time of
sound events. We first train a sound event detection system with AudioSet. The
trained sound event detection system is used to detect segments that are mostly
like to contain a target sound event. Then a regression is learnt from a
mixture of two randomly selected segments to a target segment conditioned on
the audio tagging prediction of the target segment. Our proposed system can
separate 527 kinds of sound classes from AudioSet within a single system. A
U-Net is adopted for the separation system and achieves an average SDR of 5.67
dB over 527 sound classes in AudioSet.},
  added-at      = {2020-02-12T13:28:28.000+0100},
  archiveprefix = {arXiv},
  author        = {Kong, Qiuqiang and Wang, Yuxuan and Song, Xuchen and Cao, Yin and Wang, Wenwu and Plumbley, Mark D.},
  biburl        = {https://www.bibsonomy.org/bibtex/2d771a9c2c3632f0a3cbb51d129ac59b1/topel},
  eprint        = {2002.02065},
  interhash     = {550321d13024420f34790b79b0b3986c},
  intrahash     = {d771a9c2c3632f0a3cbb51d129ac59b1},
  keywords      = {source_separation weak_supervision},
  note          = {5 pages},
  timestamp     = {2020-02-12T13:28:28.000+0100},
  title         = {Source separation with weakly labelled data: An approach to
computational auditory scene analysis},
  url           = {http://arxiv.org/abs/2002.02065},
  year          = {2020}
}