The rising popularity of intelligent mobile devices and the daunting computational cost of deep learning-based models call for efficient and accurate on-device inference schemes. We propose a quantization scheme that allows inference to be carried out using integer-only arithmetic, which can be implemented more efficiently than floating point inference on commonly available integer-only hardware. We also co-design a training procedure to preserve end-to-end model accuracy post quantization. As a result, the proposed quantization scheme improves the tradeoff between accuracy and on-device latency. The improvements are significant even on MobileNets, a model family known for run-time efficiency, and are demonstrated in ImageNet classification and COCO detection on popular CPUs.
Description
Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference - IEEE Conference Publication
%0 Conference Paper
%1 8578384
%A Jacob, B.
%A Kligys, S.
%A Chen, B.
%A Zhu, M.
%A Tang, M.
%A Howard, A.
%A Adam, H.
%A Kalenichenko, D.
%B 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition
%D 2018
%K convnets dnn quantization
%P 2704-2713
%R 10.1109/CVPR.2018.00286
%T Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference
%U https://ieeexplore.ieee.org/document/8578384
%X The rising popularity of intelligent mobile devices and the daunting computational cost of deep learning-based models call for efficient and accurate on-device inference schemes. We propose a quantization scheme that allows inference to be carried out using integer-only arithmetic, which can be implemented more efficiently than floating point inference on commonly available integer-only hardware. We also co-design a training procedure to preserve end-to-end model accuracy post quantization. As a result, the proposed quantization scheme improves the tradeoff between accuracy and on-device latency. The improvements are significant even on MobileNets, a model family known for run-time efficiency, and are demonstrated in ImageNet classification and COCO detection on popular CPUs.
@inproceedings{8578384,
  abstract    = {The rising popularity of intelligent mobile devices and the daunting computational cost of deep learning-based models call for efficient and accurate on-device inference schemes. We propose a quantization scheme that allows inference to be carried out using integer-only arithmetic, which can be implemented more efficiently than floating point inference on commonly available integer-only hardware. We also co-design a training procedure to preserve end-to-end model accuracy post quantization. As a result, the proposed quantization scheme improves the tradeoff between accuracy and on-device latency. The improvements are significant even on MobileNets, a model family known for run-time efficiency, and are demonstrated in ImageNet classification and COCO detection on popular CPUs.},
  added-at    = {2020-10-23T21:03:58.000+0200},
  author      = {Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry},
  biburl      = {https://www.bibsonomy.org/bibtex/262a0538e9b21d2a374a33133a4043e89/sohnki},
  booktitle   = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  description = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference - IEEE Conference Publication},
  doi         = {10.1109/CVPR.2018.00286},
  interhash   = {2c9fd8218b7dacf6236524a3a826e98e},
  intrahash   = {62a0538e9b21d2a374a33133a4043e89},
  issn        = {2575-7075},
  keywords    = {convnets dnn quantization},
  month       = jun,
  pages       = {2704--2713},
  timestamp   = {2020-10-23T21:03:58.000+0200},
  title       = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
  url         = {https://ieeexplore.ieee.org/document/8578384},
  year        = {2018},
}