@comment{Conference paper: typed as inproceedings, with the proceedings name in booktitle so that standard styles print the venue.}
@inproceedings{8973,
  author    = {Sourouri, Mohammed and Gillberg, Tor and Baden, Scott and Cai, Xing},
  title     = {Effective Multi-{GPU} Communication Using Multiple {CUDA} Streams and Threads},
  booktitle = {20th International Conference on Parallel and Distributed Systems ({ICPADS} 2014)},
  year      = {2014},
  pages     = {981--986},
  publisher = {IEEE},
  doi       = {10.1109/PADSW.2014.7097919},
  abstract  = {In the context of multiple GPUs that share the same PCIe bus, we propose a new communication scheme that leads to a more effective overlap of communication and computation. Multiple CUDA streams and OpenMP threads are adopted so that data can simultaneously be sent and received. A representative 3D stencil example is used to demonstrate the effectiveness of our scheme. We compare the performance of our new scheme with an MPI-based state-of-the-art scheme. Results show that our approach outperforms the state-of-the-art scheme, being up to 1.85{\texttimes} faster. However, our performance results also indicate that the current underlying PCIe bus architecture needs improvements to handle the future scenario of many GPUs per node.},
}