% Conference paper by Arndt, Linde and Blume (GlobalSIP 2015), record imported via DBLP (see biburl/url).
% Same work as the entry keyed 7418429 (identical interhash); both keys are kept so existing citations still resolve.
% The ee field is retained as imported; doi carries the bare DOI taken from that URL.
% NOTE(review): the crossref parent conf/globalsip/2015 is not visible in this part of the file; confirm it exists
% and, for classic BibTeX, that it appears after this entry.
@inproceedings{conf/globalsip/ArndtLB15,
  added-at  = {2021-10-14T00:00:00.000+0200},
  author    = {Arndt, Oliver Jakob and Linde, Tobias and Blume, Holger},
  biburl    = {https://www.bibsonomy.org/bibtex/2392ad24a8ccc8a154869c161751456ec/dblp},
  booktitle = {2015 {IEEE} Global Conference on Signal and Information Processing ({GlobalSIP})},
  crossref  = {conf/globalsip/2015},
  doi       = {10.1109/GlobalSIP.2015.7418429},
  ee        = {https://doi.org/10.1109/GlobalSIP.2015.7418429},
  interhash = {3050741be62ef53a00f2f32982acb855},
  intrahash = {392ad24a8ccc8a154869c161751456ec},
  isbn      = {978-1-4799-7591-4},
  keywords  = {dblp},
  pages     = {1402--1406},
  publisher = {IEEE},
  timestamp = {2024-04-10T13:18:06.000+0200},
  title     = {Implementation and analysis of the {Histograms of Oriented Gradients} algorithm on a heterogeneous multicore {CPU/GPU} architecture},
  url       = {http://dblp.uni-trier.de/db/conf/globalsip/globalsip2015.html#ArndtLB15},
  year      = 2015
}

% Conference paper by Arndt, Linde and Blume (GlobalSIP 2015), record imported from IEEE Xplore (see url/description).
% Same work as the entry keyed conf/globalsip/ArndtLB15 (identical interhash); both keys are kept so existing
% citations still resolve. Full author names and the publisher are taken from that sibling record.
% The numeric citation key is left unchanged because documents may already cite it.
@inproceedings{7418429,
  abstract    = {Due to the integration of multiple heterogeneous processing units on a single die, programmers can make use of processors with various features. For instance, the Samsung Exynos 5 Octa mobile processor features two ARM CPU clusters (Cortex-A7/A15), a mobile Mali GPU, and a dedicated image processor (codec). However, data transfer delays and missing data coherencies between clusters complicate heterogeneous programming. Programs should not only scale over multiple cores, but also distribute the work over heterogeneous processing units. Depending on the algorithm and platform characteristics, the selection of a proper partitioning scheme appears as a challenging task. In this work, we present a heterogeneous implementation of the Histograms of Oriented Gradients algorithm as a case study, which is a key algorithm in the field of driver assistance systems. The implementation is targeted on the CPU-clusters and the GPU of the Samsung Exynos 5 Octa 5422. In order to generate the best partitioning scheme, we specifically discuss different strategies. Therefore, we analyze the computational capabilities as well as the power consumption of the individual processing units using different algorithmic processing stages. We show that a GPU-only execution slows down the computation compared with the CPU-only version, while mapping to both devices (CPU and GPU) achieves a speedup of 1.68.},
  added-at    = {2016-04-20T12:51:23.000+0200},
  author      = {Arndt, Oliver Jakob and Linde, Tobias and Blume, Holger},
  biburl      = {https://www.bibsonomy.org/bibtex/22ba183794495fbca9a1a121b6bca7b28/imsl3s},
  booktitle   = {2015 {IEEE} Global Conference on Signal and Information Processing ({GlobalSIP})},
  description = {IEEE Xplore Abstract - Implementation and analysis of the histograms of oriented Gradients algorithm on a heterogeneous mul...},
  doi         = {10.1109/GlobalSIP.2015.7418429},
  interhash   = {3050741be62ef53a00f2f32982acb855},
  intrahash   = {2ba183794495fbca9a1a121b6bca7b28},
  keywords    = {2015 SoC heterogeneous multicore myown programming},
  month       = dec,
  pages       = {1402--1406},
  publisher   = {IEEE},
  timestamp   = {2016-04-20T13:00:40.000+0200},
  title       = {Implementation and analysis of the {Histograms of Oriented Gradients} algorithm on a heterogeneous multicore {CPU/GPU} architecture},
  url         = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7418429},
  year        = 2015
}