% Journal article (J. Parallel Distrib. Comput., vol. 144, 2020).
% The DOI is stored bare (no resolver prefix) and the month uses the standard
% three-letter macro so that bibliography styles control the presentation.
@article{15800,
  author    = {Trotter, James and Langguth, Johannes and Cai, Xing},
  title     = {Cache simulation for irregular memory traffic on multi-core {CPUs}: Case study on performance models for sparse matrix{\textendash}vector multiplication},
  journal   = {Journal of Parallel and Distributed Computing},
  volume    = {144},
  pages     = {189--205},
  year      = {2020},
  month     = jun,
  publisher = {Elsevier},
  issn      = {0743-7315},
  doi       = {10.1016/j.jpdc.2020.05.020},
  url       = {http://www.sciencedirect.com/science/article/pii/S0743731520302999},
  keywords  = {Cache simulation, Performance model, Sparse matrix{\textendash}vector multiplication, Intel Xeon, AMD Epyc},
  abstract  = {Parallel computations with irregular memory access patterns are often limited by the memory subsystems of multi-core CPUs, though it can be difficult to pinpoint and quantify performance bottlenecks precisely. We present a method for estimating volumes of data traffic caused by irregular, parallel computations on multi-core CPUs with memory hierarchies containing both private and shared caches. Further, we describe a performance model based on these estimates that applies to bandwidth-limited computations. As a case study, we consider two standard algorithms for sparse matrix{\textendash}vector multiplication, a widely used, irregular kernel. Using three different multi-core CPU systems and a set of matrices that induce a range of irregular memory access patterns, we demonstrate that our cache simulation combined with the proposed performance model accurately quantifies performance bottlenecks that would not be detected using standard best- or worst-case estimates of the data traffic volume.},
}