Data warehousing applications represent an emergent application
arena that requires the processing of relational queries and
computations over massive amounts of data. Modern general
purpose GPUs are high core count architectures that potentially
offer substantial improvements in throughput for these
applications. However, there are significant challenges that
arise due to the overheads of data movement through the memory
hierarchy and between the GPU and host CPU. This paper proposes
a set of compiler optimizations to address these challenges.
Inspired in part by loop fusion/fission optimizations in the
scientific computing community, we propose kernel fusion and
kernel fission. Kernel fusion fuses the code bodies of two GPU
kernels to i) eliminate redundant operations across dependent
kernels, ii) reduce data movement between GPU registers and GPU
memory, iii) reduce data movement between GPU memory and CPU
memory, and iv) improve spatial and temporal locality of memory
references. Kernel fission partitions a kernel into segments
such that segment computations and data transfers between the
GPU and host CPU can be overlapped. Fusion and fission can also
be applied concurrently to a set of kernels. We empirically
evaluate the benefits of fusion/fission on relational algebra
operators drawn from the TPC-H benchmark suite. All kernels are
implemented in CUDA and the experiments are performed with
NVIDIA Fermi GPUs. In general, we observed data throughput
improvements ranging from 13.1\% to 41.4\% for the SELECT
operator and queries Q1 and Q21 in the TPC-H benchmark suite. We
present key insights, lessons learned, and opportunities for
further improvements.
%0 Conference Paper
%1 Wu2012-ui
%A Wu, Haicheng
%A Diamos, G
%A Wang, Jin
%A Cadambi, S
%A Yalamanchili, S
%A Chakradhar, S
%B Parallel and Distributed Processing Symposium Workshops & PhD
Forum (IPDPSW), 2012 IEEE 26th International
%D 2012
%K Algorithm Bandwidth CPU_memory CUDA Efficient_query_execution Expose Fission Fusion GPU GPU_memory GPU_registers Graphics_processing_unit Kernel Memory_management NVIDIA_Fermi_GPU Optimization TPC-H TPC-H_benchmark_suite Throughput Warehousing compiler compiler_optimizations data_movement_reduction data_throughput_improvements data_transfers data_warehouses data_warehousing data_warehousing_applications general_purpose_GPU graphics_processing_unit graphics_processing_units kernel_fission kernel_fusion loop_fission_optimization loop_fusion_optimization memory_reference_spatial_locality memory_reference_temporal_locality optimising_compilers optimization parallel_architectures query_processing redundant_operation_elimination relational_algebra relational_algebra_operators relational_computation_processing relational_query_processing scientific_computing_community segment_computations storage_management
%P 2433--2442
%T Optimizing Data Warehousing Applications for GPUs Using Kernel
Fusion/Fission
%X Data warehousing applications represent an emergent application
arena that requires the processing of relational queries and
computations over massive amounts of data. Modern general
purpose GPUs are high core count architectures that potentially
offer substantial improvements in throughput for these
applications. However, there are significant challenges that
arise due to the overheads of data movement through the memory
hierarchy and between the GPU and host CPU. This paper proposes
a set of compiler optimizations to address these challenges.
Inspired in part by loop fusion/fission optimizations in the
scientific computing community, we propose kernel fusion and
kernel fission. Kernel fusion fuses the code bodies of two GPU
kernels to i) eliminate redundant operations across dependent
kernels, ii) reduce data movement between GPU registers and GPU
memory, iii) reduce data movement between GPU memory and CPU
memory, and iv) improve spatial and temporal locality of memory
references. Kernel fission partitions a kernel into segments
such that segment computations and data transfers between the
GPU and host CPU can be overlapped. Fusion and fission can also
be applied concurrently to a set of kernels. We empirically
evaluate the benefits of fusion/fission on relational algebra
operators drawn from the TPC-H benchmark suite. All kernels are
implemented in CUDA and the experiments are performed with
NVIDIA Fermi GPUs. In general, we observed data throughput
improvements ranging from 13.1\% to 41.4\% for the SELECT
operator and queries Q1 and Q21 in the TPC-H benchmark suite. We
present key insights, lessons learned, and opportunities for
further improvements.
@inproceedings{Wu2012-ui,
  abstract   = {Data warehousing applications represent an emergent application
                arena that requires the processing of relational queries and
                computations over massive amounts of data. Modern general
                purpose GPUs are high core count architectures that potentially
                offer substantial improvements in throughput for these
                applications. However, there are significant challenges that
                arise due to the overheads of data movement through the memory
                hierarchy and between the GPU and host CPU. This paper proposes
                a set of compiler optimizations to address these challenges.
                Inspired in part by loop fusion/fission optimizations in the
                scientific computing community, we propose kernel fusion and
                kernel fission. Kernel fusion fuses the code bodies of two GPU
                kernels to i) eliminate redundant operations across dependent
                kernels, ii) reduce data movement between GPU registers and GPU
                memory, iii) reduce data movement between GPU memory and CPU
                memory, and iv) improve spatial and temporal locality of memory
                references. Kernel fission partitions a kernel into segments
                such that segment computations and data transfers between the
                GPU and host CPU can be overlapped. Fusion and fission can also
                be applied concurrently to a set of kernels. We empirically
                evaluate the benefits of fusion/fission on relational algebra
                operators drawn from the TPC-H benchmark suite. All kernels are
                implemented in CUDA and the experiments are performed with
                NVIDIA Fermi GPUs. In general, we observed data throughput
                improvements ranging from 13.1\% to 41.4\% for the SELECT
                operator and queries Q1 and Q21 in the TPC-H benchmark suite. We
                present key insights, lessons learned, and opportunities for
                further improvements.},
  added-at   = {2015-04-10T18:02:47.000+0200},
  author     = {Wu, Haicheng and Diamos, Gregory and Wang, Jin and Cadambi, Srimat and Yalamanchili, Sudhakar and Chakradhar, Srimat},
  biburl     = {https://www.bibsonomy.org/bibtex/2ac1203bd4a45dd7dc2a0b3cd2cd348bb/christophv},
  booktitle  = {Parallel and Distributed Processing Symposium Workshops \& {PhD}
                Forum ({IPDPSW}), 2012 {IEEE} 26th International},
  interhash  = {a2cdfa076c95e35ea872db1ffa4df20b},
  intrahash  = {ac1203bd4a45dd7dc2a0b3cd2cd348bb},
  keywords   = {Algorithm Bandwidth CPU_memory CUDA Efficient_query_execution Expose Fission Fusion GPU GPU_memory GPU_registers Graphics_processing_unit Kernel Memory_management NVIDIA_Fermi_GPU Optimization TPC-H TPC-H_benchmark_suite Throughput Warehousing compiler compiler_optimizations data_movement_reduction data_throughput_improvements data_transfers data_warehouses data_warehousing data_warehousing_applications general_purpose_GPU graphics_processing_unit graphics_processing_units kernel_fission kernel_fusion loop_fission_optimization loop_fusion_optimization memory_reference_spatial_locality memory_reference_temporal_locality optimising_compilers optimization parallel_architectures query_processing redundant_operation_elimination relational_algebra relational_algebra_operators relational_computation_processing relational_query_processing scientific_computing_community segment_computations storage_management},
  month      = may,
  pages      = {2433--2442},
  timestamp  = {2016-01-04T14:22:08.000+0100},
  title      = {Optimizing Data Warehousing Applications for {GPUs} Using Kernel
                {Fusion/Fission}},
  year       = {2012},
}