% Conference paper: the venue field names an IEEE conference, so the entry
% is typed as inproceedings with the proceedings title in booktitle.
@inproceedings{13369,
  author    = {Sourouri, Mohammed and Langguth, Johannes and Spiga, Filippo and Baden, Scott and Cai, Xing},
  title     = {{CPU+GPU} Programming of Stencil Computations for Resource-Efficient Use of {GPU} Clusters},
  booktitle = {{IEEE} 18th International Conference on Computational Science and Engineering},
  pages     = {17--26},
  publisher = {IEEE Computer Society},
  year      = {2015},
  month     = oct,
  doi       = {10.1109/CSE.2015.33},
  keywords  = {GPU, CUDA, stencil, MPI, CPU+GPU computing},
  abstract  = {On modern GPU clusters, the role of the CPUs is often restricted to controlling the GPUs and handling MPI communication. The unused computing power of the CPUs, however, can be considerable for computations whose performance is bounded by memory traffic. This paper investigates the challenges of simultaneous usage of CPUs and GPUs for computation. Our emphasis is on deriving a heterogeneous CPU+GPU programming approach that combines MPI, OpenMP and CUDA. To effectively hide the overhead of various inter- and intra-node communications, a new level of task parallelism is introduced on top of the conventional data parallelism. Combined with a suitable workload division between the CPUs and GPUs, our CPU+GPU programming approach is able to fully utilize the different processing units. The programming details and achievable performance are exemplified by a widely used 3D 7-point stencil computation, which shows high performance and scaling in experiments using up to 64 CPU-GPU nodes.},
}