Text-to-image generation has traditionally focused on finding better modeling
assumptions for training on a fixed dataset. These assumptions might involve
complex architectures, auxiliary losses, or side information such as object
part labels or segmentation masks supplied during training. We describe a
simple approach for this task based on a transformer that autoregressively
models the text and image tokens as a single stream of data. With sufficient
data and scale, our approach is competitive with previous domain-specific
models when evaluated in a zero-shot fashion.
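The mechanism the abstract names (concatenating text tokens and discrete image tokens into a single sequence and training a decoder-only transformer with next-token prediction) can be sketched as below. This is a minimal illustrative sketch, not the paper's implementation: the vocabulary sizes, model dimensions, sequence lengths, and the class name SingleStreamTransformer are assumptions, and the image tokens are presumed to come from a separately trained discrete image tokenizer.

    # Minimal sketch of single-stream autoregressive text+image modeling.
    # Assumes text is already tokenized to ids and the image has been
    # quantized to a grid of discrete codes; all sizes are illustrative.
    import torch
    import torch.nn as nn

    TEXT_VOCAB, IMAGE_VOCAB = 16384, 8192   # illustrative vocabulary sizes
    VOCAB = TEXT_VOCAB + IMAGE_VOCAB        # one shared token space

    class SingleStreamTransformer(nn.Module):  # hypothetical name
        def __init__(self, d_model=512, n_layers=4, n_heads=8, max_len=320):
            super().__init__()
            self.tok = nn.Embedding(VOCAB, d_model)
            self.pos = nn.Embedding(max_len, d_model)
            layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
            self.blocks = nn.TransformerEncoder(layer, n_layers)
            self.head = nn.Linear(d_model, VOCAB)

        def forward(self, tokens):  # tokens: (B, T) ids over the shared vocab
            B, T = tokens.shape
            x = self.tok(tokens) + self.pos(torch.arange(T, device=tokens.device))
            # Causal mask so each position attends only to earlier tokens.
            mask = nn.Transformer.generate_square_subsequent_mask(T).to(tokens.device)
            return self.head(self.blocks(x, mask=mask))  # (B, T, VOCAB) logits

    # Text ids and image codes (offset into the shared vocabulary) form
    # one stream, trained with ordinary next-token prediction.
    text_ids = torch.randint(0, TEXT_VOCAB, (2, 64))
    image_codes = torch.randint(0, IMAGE_VOCAB, (2, 256)) + TEXT_VOCAB
    stream = torch.cat([text_ids, image_codes], dim=1)
    logits = SingleStreamTransformer()(stream[:, :-1])
    loss = nn.functional.cross_entropy(logits.reshape(-1, VOCAB),
                                       stream[:, 1:].reshape(-1))

Because text and image tokens share one vocabulary and one loss, generation is simply conditioning on the text prefix and sampling image tokens autoregressively; no task-specific architecture or auxiliary loss is needed.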
@misc{ramesh2021zeroshot,
  author   = {Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya},
  title    = {Zero-Shot Text-to-Image Generation},
  year     = {2021},
  note     = {arXiv:2102.12092},
  keywords = {text2img zero_shot},
  url      = {http://arxiv.org/abs/2102.12092}
}