In this paper we propose a novel model for unconditional audio generation
based on generating one audio sample at a time. We show that our model, which
benefits from combining memoryless modules, namely autoregressive multilayer
perceptrons, with stateful recurrent neural networks in a hierarchical structure,
is able to capture the underlying sources of variation in temporal sequences
over very long time spans, across three datasets of different kinds. Human
evaluation of the generated samples indicates that our model is preferred over
competing models. We also show how each component of the model contributes to
the exhibited performance.
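The architecture described above pairs a stateful frame-level RNN (long-span memory) with a memoryless sample-level MLP (local, per-sample detail). The PyTorch sketch below is a minimal illustration of that two-tier, one-sample-at-a-time idea, not the paper's implementation: the GRU cell, frame_size=16, the 256-level quantization, and all module and parameter names are assumptions chosen for brevity.

import torch
import torch.nn as nn

class TwoTierSampleModel(nn.Module):
    # Illustrative two-tier, sample-level generator in the spirit of the paper
    # (a sketch, not the authors' model). A stateful GRU runs once per frame
    # and carries long-range context; a memoryless MLP predicts each next
    # sample from that frame summary plus the most recent frame_size samples.
    def __init__(self, frame_size=16, hidden=256, levels=256):
        super().__init__()
        self.frame_size, self.hidden, self.levels = frame_size, hidden, levels
        self.frame_rnn = nn.GRUCell(frame_size, hidden)  # stateful module (memory)
        self.mlp = nn.Sequential(                        # memoryless module
            nn.Linear(hidden + frame_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, levels),                   # logits over quantized levels
        )

    @torch.no_grad()
    def generate(self, n_frames, batch=1):
        fs = self.frame_size
        h = torch.zeros(batch, self.hidden)              # long-span recurrent state
        window = torch.zeros(batch, fs)                  # last fs samples in [-1, 1]
        out = []
        for _ in range(n_frames):
            h = self.frame_rnn(window, h)                # one RNN step per frame
            for _ in range(fs):                          # one MLP step per sample
                logits = self.mlp(torch.cat([h, window], dim=1))
                idx = torch.distributions.Categorical(logits=logits).sample()
                sample = idx.float() / (self.levels - 1) * 2.0 - 1.0  # dequantize
                window = torch.cat([window[:, 1:], sample.unsqueeze(1)], dim=1)
                out.append(idx)
        return torch.stack(out, dim=1)                   # (batch, n_frames * fs)

model = TwoTierSampleModel()
codes = model.generate(n_frames=4)   # untrained, so the output is just noise
print(codes.shape)                   # torch.Size([1, 64])

Training such a sketch would minimize cross-entropy between the logits and the quantized ground-truth samples under teacher forcing; the division of labor, RNN for long spans and MLP for per-sample detail, is the point the abstract is making.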
[1612.07837] SampleRNN: An Unconditional End-to-End Neural Audio Generation Model
@misc{mehri2016samplernn,
  author   = {Mehri, Soroush and Kumar, Kundan and Gulrajani, Ishaan and Kumar, Rithesh and Jain, Shubham and Sotelo, Jose and Courville, Aaron and Bengio, Yoshua},
  title    = {SampleRNN: An Unconditional End-to-End Neural Audio Generation Model},
  year     = {2016},
  url      = {http://arxiv.org/abs/1612.07837},
  note     = {cite arxiv:1612.07837},
  keywords = {dnn rnn speech speech-synthesis}
}