Transformer is a promising neural network learner, and has achieved great success in various machine learning tasks. Thanks to the recent prevalence of multimodal applications and big data, Transformer-based multimodal learning has become a hot topic in AI research. This paper presents a comprehensive survey of Transformer techniques oriented at multimodal data. The main contents of this survey include: (1) a background of multimodal learning, Transformer ecosystem, and the multimodal big data era, (2) a theoretical review of Vanilla Transformer, Vision Transformer, and multimodal Transformers, from a geometrically topological perspective, (3) a review of multimodal Transformer applications, via two important paradigms, i.e., for multimodal pretraining and for specific multimodal tasks, (4) a summary of the common challenges and designs shared by the multimodal Transformer models and applications, and (5) a discussion of open problems and potential research directions for the community.
arXiv.org Snapshot:/Users/pascal/Zotero/storage/HGZN5MS5/2206.html:text/html;Full Text PDF:/Users/pascal/Zotero/storage/SEBZCPJP/Xu et al. - 2023 - Multimodal Learning with Transformers A Survey.pdf:application/pdf
%0 Generic
%1 xu_multimodal_2023
%A Xu, Peng
%A Zhu, Xiatian
%A Clifton, David A.
%D 2023
%I arXiv
%K Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, ecomodelling
%T Multimodal Learning with Transformers: A Survey
%U http://arxiv.org/abs/2206.06488
%X Transformer is a promising neural network learner, and has achieved great success in various machine learning tasks. Thanks to the recent prevalence of multimodal applications and big data, Transformer-based multimodal learning has become a hot topic in AI research. This paper presents a comprehensive survey of Transformer techniques oriented at multimodal data. The main contents of this survey include: (1) a background of multimodal learning, Transformer ecosystem, and the multimodal big data era, (2) a theoretical review of Vanilla Transformer, Vision Transformer, and multimodal Transformers, from a geometrically topological perspective, (3) a review of multimodal Transformer applications, via two important paradigms, i.e., for multimodal pretraining and for specific multimodal tasks, (4) a summary of the common challenges and designs shared by the multimodal Transformer models and applications, and (5) a discussion of open problems and potential research directions for the community.
%Z Comment: This paper is accepted by IEEE TPAMI
@comment{arXiv preprint (accepted at IEEE TPAMI). Keywords reconstructed from
scrambled export; arXiv ID moved into proper eprint fields. Provenance fields
(biburl/interhash/intrahash/timestamps/file) kept verbatim from BibSonomy/Zotero.}
@misc{xu_multimodal_2023,
  title         = {Multimodal Learning with {Transformers}: {A} Survey},
  shorttitle    = {Multimodal Learning with {Transformers}},
  author        = {Xu, Peng and Zhu, Xiatian and Clifton, David A.},
  year          = 2023,
  month         = may,
  publisher     = {arXiv},
  eprint        = {2206.06488},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  url           = {http://arxiv.org/abs/2206.06488},
  urldate       = {2023-07-10},
  note          = {arXiv:2206.06488 [cs]},
  annote        = {Comment: This paper is accepted by IEEE TPAMI},
  abstract      = {Transformer is a promising neural network learner, and has achieved great success in various machine learning tasks. Thanks to the recent prevalence of multimodal applications and big data, Transformer-based multimodal learning has become a hot topic in AI research. This paper presents a comprehensive survey of Transformer techniques oriented at multimodal data. The main contents of this survey include: (1) a background of multimodal learning, Transformer ecosystem, and the multimodal big data era, (2) a theoretical review of Vanilla Transformer, Vision Transformer, and multimodal Transformers, from a geometrically topological perspective, (3) a review of multimodal Transformer applications, via two important paradigms, i.e., for multimodal pretraining and for specific multimodal tasks, (4) a summary of the common challenges and designs shared by the multimodal Transformer models and applications, and (5) a discussion of open problems and potential research directions for the community.},
  keywords      = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, ecomodelling},
  added-at      = {2023-07-31T08:05:54.000+0200},
  timestamp     = {2023-07-31T08:07:14.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2f5d56723678ce0ef73eda22dd8c2d35b/jascal_panetzky},
  interhash     = {ca9c92a3e88db61bb8f9f2ad57f13f96},
  intrahash     = {f5d56723678ce0ef73eda22dd8c2d35b},
  file          = {arXiv.org Snapshot:/Users/pascal/Zotero/storage/HGZN5MS5/2206.html:text/html;Full Text PDF:/Users/pascal/Zotero/storage/SEBZCPJP/Xu et al. - 2023 - Multimodal Learning with Transformers A Survey.pdf:application/pdf},
}