% Conference contribution listed under GPU Technology Conference (GTC) 2020.
% Kept as a misc entry because the record names no proceedings volume;
% the event name is carried in howpublished so that misc styles print it.
% The month uses the predefined three-letter macro so styles can localise it.
@misc{15755,
  author       = {Hustad, Kristian and Cai, Xing and Langguth, Johannes and Arevalo, Hermenegild},
  title        = {Efficient simulations of patient-specific electrical heart activity on the {DGX-2}},
  howpublished = {GPU Technology Conference (GTC) 2020},
  publisher    = {Nvidia},
  address      = {Silicon Valley, USA},
  year         = {2020},
  month        = mar,
  abstract     = {Patients who have suffered a heart attack have an elevated risk of developing arrhythmia.
    The use of computer simulations of the electrical activity in the hearts of these patients is emerging
    as an alternative to traditional, more invasive examinations performed by doctors today.
    Recent advances in personalised arrhythmia risk prediction show that computational models can provide
    not only safer but also more accurate results than invasive procedures.
    However, biophysically accurate simulations of the electrical activity in the heart require solving
    linear systems over fine meshes and time resolutions, which can take hours or even days.
    This limits the use of such simulations in the clinic where diagnosis and treatment planning can be
    time sensitive, even if it is just for the reason of operation schedules.
    Furthermore, the non-interactive, non-intuitive way of accessing simulations and their results makes
    it hard to study these collaboratively.
    Overcoming these limitations requires speeding up computations from hours to seconds, which requires
    a massive increase in computational capabilities.
    We have developed a code that is capable of performing highly efficient heart simulations on the DGX-2,
    making use of all 16 V100 GPUs.
    Using a patient-specific unstructured tetrahedral mesh with 11.7 million cells, we are able to simulate
    the electrical heart activity at 1/30 of real-time.
    Moreover, we are able to show that the throughput achieved using all 16 GPUs in the DGX-2 is 77.6\%
    of the theoretical maximum.
    We achieved this through extensive optimisations of the two kernels constituting the body of the main
    loop in the simulator.
    In the kernel solving the diffusion equation (governing the spread of the electrical signal),
    consisting of a sparse matrix-vector multiplication, we minimise the memory traffic by reordering the
    mesh (and matrix) elements into clusters that fit in the V100's L2 cache.
    In the kernel solving the cell model (describing the complex interactions of ion channels in the cell
    membrane), we apply sophisticated domain-specific optimisations to reduce the number of floating point
    operations to the point where the kernel becomes memory bound.
    After optimisation, both kernels are memory bound, and we derive the minimum memory traffic, which we
    then divide by the aggregate memory bandwidth to obtain a lower bound on the execution time.
    Topics discussed include optimisations for sparse matrix-vector multiplications, strategies for
    handling inter-device communication for unstructured meshes, and lessons we learnt while programming
    the DGX-2.},
}