We study the performance portability of OpenCL across diverse
architectures including NVIDIA GPU, Intel Ivy Bridge CPU, and
AMD Fusion APU. We present detailed performance analysis at
assembly level on three exemplar OpenCL benchmarks: SGEMM, SpMV,
and FFT. We also identify a number of tuning knobs that are
critical to performance portability, including threads-data
mapping, data layout, tiling size, data caching, and
operation-specific factors. We further demonstrate that proper
tuning could improve the OpenCL portable performance from the
current 15\% to a potential 67\% of the state-of-the-art
performance on the Ivy Bridge CPU. Finally, we evaluate the
current OpenCL programming model, and propose a list of
extensions that improve performance portability.
%0 Book Section
%1 Zhang2013-iw
%A Zhang, Yao
%A Ii, Mark Sinclair
%A Chien, Andrew A
%B Supercomputing
%D 2013
%I Springer Berlin Heidelberg
%K Expose Performance_portability
%P 136--150
%T Improving Performance Portability in OpenCL Programs
%X We study the performance portability of OpenCL across diverse
architectures including NVIDIA GPU, Intel Ivy Bridge CPU, and
AMD Fusion APU. We present detailed performance analysis at
assembly level on three exemplar OpenCL benchmarks: SGEMM, SpMV,
and FFT. We also identify a number of tuning knobs that are
critical to performance portability, including threads-data
mapping, data layout, tiling size, data caching, and
operation-specific factors. We further demonstrate that proper
tuning could improve the OpenCL portable performance from the
current 15\% to a potential 67\% of the state-of-the-art
performance on the Ivy Bridge CPU. Finally, we evaluate the
current OpenCL programming model, and propose a list of
extensions that improve performance portability.
% Zhang, Sinclair II, Chien (2013): OpenCL performance-portability study.
% BibSonomy export; author names normalised ("II" is a generational suffix,
% entered in BibTeX's "Last, Jr, First" form so the surname is "Sinclair").
@incollection{Zhang2013-iw,
  author    = {Zhang, Yao and Sinclair, II, Mark and Chien, Andrew A.},
  title     = {Improving Performance Portability in {OpenCL} Programs},
  booktitle = {Supercomputing},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Berlin Heidelberg},
  year      = {2013},
  pages     = {136--150},
  keywords  = {Expose Performance_portability},
  abstract  = {We study the performance portability of OpenCL across diverse
architectures including NVIDIA GPU, Intel Ivy Bridge CPU, and
AMD Fusion APU. We present detailed performance analysis at
assembly level on three exemplar OpenCL benchmarks: SGEMM, SpMV,
and FFT. We also identify a number of tuning knobs that are
critical to performance portability, including threads-data
mapping, data layout, tiling size, data caching, and
operation-specific factors. We further demonstrate that proper
tuning could improve the OpenCL portable performance from the
current 15\% to a potential 67\% of the state-of-the-art
performance on the Ivy Bridge CPU. Finally, we evaluate the
current OpenCL programming model, and propose a list of
extensions that improve performance portability.},
  added-at  = {2015-06-08T22:32:54.000+0200},
  timestamp = {2016-01-04T14:22:08.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2935d2fd1225e10767988e0e4b15760ba/christophv},
  interhash = {802b4362cd6b34d400c6fcaf2f6e0af6},
  intrahash = {935d2fd1225e10767988e0e4b15760ba},
}