Video snapshot compressive imaging (SCI) captures multiple sequential video frames by a single measurement using the idea of computational imaging. The underlying principle is to modulate high-speed frames through different masks and these modulated frames are summed to a single measurement captured by a low-speed 2D sensor (dubbed optical encoder); following this, algorithms are employed to reconstruct the desired high-speed frames (dubbed software decoder) if needed. In this article, we consider the reconstruction algorithm in video SCI, i.e., recovering a series of video frames from a compressed measurement. Specifically, we propose a Spatial-Temporal transFormer (STFormer) to exploit the correlation in both spatial and temporal domains. STFormer network is composed of a token generation block, a video reconstruction block, and these two blocks are connected by a series of STFormer blocks. Each STFormer block consists of a spatial self-attention branch, a temporal self-attention branch and the outputs of these two branches are integrated by a fusion network. Extensive results on both simulated and real data demonstrate the state-of-the-art performance of STFormer. The code and models are publicly available at https://github.com/ucaswangls/STFormer.
%0 Journal Article
%1 stformer
%A Wang, L.
%A Cao, M.
%A Zhong, Y.
%A Yuan, X.
%C Los Alamitos, CA, USA
%D 2023
%I IEEE Computer Society
%J IEEE Transactions on Pattern Analysis and Machine Intelligence
%K analysis cameras correlation-color-reconstruction gray-scale image image-transformers task-analysis
%N 07
%P 9072-9089
%R 10.1109/TPAMI.2022.3225382
%T Spatial-Temporal Transformer for Video Snapshot Compressive Imaging
%V 45
%X Video snapshot compressive imaging (SCI) captures multiple sequential video frames by a single measurement using the idea of computational imaging. The underlying principle is to modulate high-speed frames through different masks and these modulated frames are summed to a single measurement captured by a low-speed 2D sensor (dubbed optical encoder); following this, algorithms are employed to reconstruct the desired high-speed frames (dubbed software decoder) if needed. In this article, we consider the reconstruction algorithm in video SCI, i.e., recovering a series of video frames from a compressed measurement. Specifically, we propose a Spatial-Temporal transFormer (STFormer) to exploit the correlation in both spatial and temporal domains. STFormer network is composed of a token generation block, a video reconstruction block, and these two blocks are connected by a series of STFormer blocks. Each STFormer block consists of a spatial self-attention branch, a temporal self-attention branch and the outputs of these two branches are integrated by a fusion network. Extensive results on both simulated and real data demonstrate the state-of-the-art performance of STFormer. The code and models are publicly available at https://github.com/ucaswangls/STFormer.
@article{stformer,
  abstract  = {Video snapshot compressive imaging (SCI) captures multiple sequential video frames by a single measurement using the idea of computational imaging. The underlying principle is to modulate high-speed frames through different masks and these modulated frames are summed to a single measurement captured by a low-speed 2D sensor (dubbed optical encoder); following this, algorithms are employed to reconstruct the desired high-speed frames (dubbed software decoder) if needed. In this article, we consider the reconstruction algorithm in video SCI, i.e., recovering a series of video frames from a compressed measurement. Specifically, we propose a Spatial-Temporal transFormer (STFormer) to exploit the correlation in both spatial and temporal domains. STFormer network is composed of a token generation block, a video reconstruction block, and these two blocks are connected by a series of STFormer blocks. Each STFormer block consists of a spatial self-attention branch, a temporal self-attention branch and the outputs of these two branches are integrated by a fusion network. Extensive results on both simulated and real data demonstrate the state-of-the-art performance of STFormer. The code and models are publicly available at https://github.com/ucaswangls/STFormer.},
  added-at  = {2023-06-17T08:56:14.000+0200},
  address   = {Los Alamitos, CA, USA},
  author    = {Wang, Lishun and Cao, Miao and Zhong, Yong and Yuan, Xin},
  biburl    = {https://www.bibsonomy.org/bibtex/2702d93d52fdb8ec6ded03a5954780096/andolab},
  doi       = {10.1109/TPAMI.2022.3225382},
  interhash = {5514f3d88e7dd814a563eb3ce0f0b74d},
  intrahash = {702d93d52fdb8ec6ded03a5954780096},
  issn      = {1939-3539},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  keywords  = {analysis cameras correlation-color-reconstruction gray-scale image image-transformers task-analysis},
  month     = jul,
  number    = {7},
  pages     = {9072--9089},
  publisher = {IEEE Computer Society},
  timestamp = {2023-09-30T15:57:35.000+0200},
  title     = {Spatial-Temporal Transformer for Video Snapshot Compressive Imaging},
  volume    = {45},
  year      = {2023},
}