Language models (LMs) have grown non-stop over the last decade, from
sequence-to-sequence architectures to the state-of-the-art, purely
attention-based Transformers. In this work, we demonstrate how including
deep generative models within BERT can yield more versatile models, able to
impute missing or noisy words with richer text and even to improve the BLEU
score. More precisely, we use a Gaussian Mixture Variational Autoencoder
(GMVAE) as a regularization layer and show its effectiveness not only in
Transformers but also in the most relevant encoder-decoder based LMs:
seq2seq with and without attention.
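
The abstract only names the technique, so the following is a rough, hypothetical sketch of what a GMVAE regularization layer over Transformer hidden states could look like: a VAE-style bottleneck whose latent prior is a learnable mixture of Gaussians, with the KL term estimated by Monte Carlo. It is PyTorch-style pseudillustration; the class, parameter, and dimension names are assumptions, not the authors' code.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class GMVAERegularizer(nn.Module):
    """VAE bottleneck with a learnable Gaussian-mixture prior over the latent code."""
    def __init__(self, hidden_dim=768, latent_dim=64, n_components=10):
        super().__init__()
        self.enc = nn.Linear(hidden_dim, 2 * latent_dim)   # hidden state -> (mu, logvar)
        self.dec = nn.Linear(latent_dim, hidden_dim)        # reconstruct the hidden state
        # Learnable mixture-of-Gaussians prior p(z)
        self.prior_logits = nn.Parameter(torch.zeros(n_components))
        self.prior_mu = nn.Parameter(torch.randn(n_components, latent_dim))
        self.prior_logvar = nn.Parameter(torch.zeros(n_components, latent_dim))

    def forward(self, h):                                    # h: (batch, seq, hidden_dim)
        mu, logvar = self.enc(h).chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)   # reparameterization trick
        h_rec = self.dec(z)
        recon = F.mse_loss(h_rec, h)
        log2pi = math.log(2.0 * math.pi)
        # Monte-Carlo KL estimate: log q(z|h) - log p(z), with p(z) a Gaussian mixture
        log_q = (-0.5 * (logvar + (z - mu) ** 2 / logvar.exp() + log2pi)).sum(-1)
        diff = z.unsqueeze(-2) - self.prior_mu               # (batch, seq, K, latent)
        comp = (-0.5 * (self.prior_logvar + diff ** 2 / self.prior_logvar.exp() + log2pi)).sum(-1)
        log_p = torch.logsumexp(F.log_softmax(self.prior_logits, -1) + comp, dim=-1)
        kl = (log_q - log_p).mean()
        return h_rec, recon + kl          # auxiliary loss to be added to the LM objective

A plausible use is to pass an intermediate Transformer hidden state through this module during training and add the returned auxiliary loss to the language-model objective; whether the reconstruction replaces the hidden state or only contributes the regularizing loss is a design choice the abstract does not specify.
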
Description
Regularizing Transformers With Deep Probabilistic Layers
%0 Generic
%1 aguilera2021regularizing
%A Aguilera, Aurora Cobo
%A Olmos, Pablo Martínez
%A Artés-Rodríguez, Antonio
%A Pérez-Cruz, Fernando
%D 2021
%K neural-network nlp
%T Regularizing Transformers With Deep Probabilistic Layers
%U http://arxiv.org/abs/2108.10764
@misc{aguilera2021regularizing,
added-at = {2021-10-13T21:08:53.000+0200},
author = {Aguilera, Aurora Cobo and Olmos, Pablo Martínez and Artés-Rodríguez, Antonio and Pérez-Cruz, Fernando},
biburl = {https://www.bibsonomy.org/bibtex/27c5abea4386373af44b007b941c881dd/stdiff},
description = {Regularizing Transformers With Deep Probabilistic Layers},
interhash = {0e0fdea2066e7d0bdfc948e537849aa8},
intrahash = {7c5abea4386373af44b007b941c881dd},
keywords = {neural-network nlp},
note = {cite arxiv:2108.10764},
timestamp = {2021-10-13T21:08:53.000+0200},
title = {Regularizing Transformers With Deep Probabilistic Layers},
url = {http://arxiv.org/abs/2108.10764},
year = 2021
}