@comment{Journal article (The Journal of Supercomputing, vol. 71, 2015) on an analytical GPU performance model for 3D stencil computations. The numeric citation key is retained so that existing citations keep resolving.}
@article{13115,
  author    = {Su, Huayou and Cai, Xing and Wen, Mei and Zhang, Chunyuan},
  title     = {An Analytical {GPU} Performance Model for {3D} Stencil Computations from the Angle of Data Traffic},
  journal   = {The Journal of Supercomputing},
  year      = {2015},
  month     = feb,
  volume    = {71},
  pages     = {2433--2453},
  publisher = {Springer},
  issn      = {0920-8542},
  doi       = {10.1007/s11227-015-1392-1},
  url       = {http://link.springer.com/article/10.1007/s11227-015-1392-1},
  keywords  = {GPU, performance modeling, 3D stencil methods},
  abstract  = {The achievable GPU performance of many scientific computations is not determined by a GPU's peak floating-point rate, but rather how fast data are moved through different stages of the entire memory hierarchy. We take low-order 3D stencil computations as a representative class to study the reachable GPU performance from the angle of data traffic. Specifically, we propose a simple analytical model to estimate the execution time based on quantifying the data traffic volume at three stages: (1) between registers and on-SMX storage, (2) between on-SMX storage and L2 cache, (3) between L2 cache and GPU's device memory. Three associated granularities are used: a CUDA thread, a thread block, and a set of simultaneously active thread blocks. For four 3D stencil computations, NVIDIA's profiling tools have been used to verify the accuracy of the quantified data traffic volumes, by trying a large number of executions with different problem sizes and thread block configurations. Moreover, by introducing an imbalance coefficient, together with the known realistic memory bandwidths, we can predict the execution time usage based on the quantified data traffic volumes. For the four 3D stencils, the average error of the time prediction is 6.9\% for a baseline implementation approach, whereas for a blocking implementation approach the average prediction error is 9.5\%.},
}