When adapting GPU-specific OpenCL kernels to run on
multi-core/many-core CPUs, coarsening the thread granularity is
necessary and thus extensively used. However, locality concerns
exposed in GPU-specific OpenCL code are usually inherited
without analysis, which may give side-effects on the CPU
performance. When executing GPU-specific kernels on CPUs,
local-memory arrays no longer match well with the hardware and
the associated synchronizations are costly. To solve this
dilemma, we actively analyze the memory access patterns by using
array-access descriptors derived from GPU-specific kernels,
which can thus be adapted for CPUs by removing all the unwanted
local-memory arrays together with the obsolete barrier
statements. Experiments show that the automated transformation
can satisfactorily improve OpenCL kernel performances on Sandy
Bridge CPU and Intel's Many-Integrated-Core coprocessor.
%0 Book Section
%1 Huang2014-lv
%A Huang, Dafei
%A Wen, Mei
%A Xun, Changqing
%A Chen, Dong
%A Cai, Xing
%A Qiao, Yuran
%A Wu, Nan
%A Zhang, Chunyuan
%B Euro-Par 2014 Parallel Processing
%D 2014
%I Springer International Publishing
%K Expose Memory_access Memory_hierarchy OpenCL
%P 210--221
%T Automated Transformation of GPU-Specific OpenCL Kernels
Targeting Performance Portability on Multi-Core/Many-Core
CPUs
%X When adapting GPU-specific OpenCL kernels to run on
multi-core/many-core CPUs, coarsening the thread granularity is
necessary and thus extensively used. However, locality concerns
exposed in GPU-specific OpenCL code are usually inherited
without analysis, which may give side-effects on the CPU
performance. When executing GPU-specific kernels on CPUs,
local-memory arrays no longer match well with the hardware and
the associated synchronizations are costly. To solve this
dilemma, we actively analyze the memory access patterns by using
array-access descriptors derived from GPU-specific kernels,
which can thus be adapted for CPUs by removing all the unwanted
local-memory arrays together with the obsolete barrier
statements. Experiments show that the automated transformation
can satisfactorily improve OpenCL kernel performances on Sandy
Bridge CPU and Intel's Many-Integrated-Core coprocessor.
@comment{Huang et al. 2014, in Euro-Par 2014 Parallel Processing (LNCS). Bibliographic fields come first; the trailing added-at/timestamp/biburl/interhash/intrahash fields are BibSonomy export metadata.}
@incollection{Huang2014-lv,
  author    = {Huang, Dafei and Wen, Mei and Xun, Changqing and Chen, Dong and Cai, Xing and Qiao, Yuran and Wu, Nan and Zhang, Chunyuan},
  title     = {Automated Transformation of {GPU-Specific} {OpenCL} Kernels Targeting Performance Portability on {Multi-Core/Many-Core} {CPUs}},
  booktitle = {{Euro-Par} 2014 Parallel Processing},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer International Publishing},
  pages     = {210--221},
  year      = {2014},
  keywords  = {Expose Memory_access Memory_hierarchy OpenCL},
  abstract  = {When adapting GPU-specific OpenCL kernels to run on
    multi-core/many-core CPUs, coarsening the thread granularity is
    necessary and thus extensively used. However, locality concerns
    exposed in GPU-specific OpenCL code are usually inherited
    without analysis, which may give side-effects on the CPU
    performance. When executing GPU-specific kernels on CPUs,
    local-memory arrays no longer match well with the hardware and
    the associated synchronizations are costly. To solve this
    dilemma, we actively analyze the memory access patterns by using
    array-access descriptors derived from GPU-specific kernels,
    which can thus be adapted for CPUs by removing all the unwanted
    local-memory arrays together with the obsolete barrier
    statements. Experiments show that the automated transformation
    can satisfactorily improve OpenCL kernel performances on Sandy
    Bridge CPU and Intel's Many-Integrated-Core coprocessor.},
  added-at  = {2015-04-10T18:02:47.000+0200},
  timestamp = {2016-01-04T14:22:08.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2fccf7487631bc118042781a584a26857/christophv},
  interhash = {c2c3eb9c6efdc9cd8a9ba38f76cf9893},
  intrahash = {fccf7487631bc118042781a584a26857},
}