@misc{27210,
  title        = {Efficient simulations of patient-specific electrical heart activity on the {DGX-2}},
  year         = {2020},
  month        = mar,
  publisher    = {Nvidia},
  howpublished = {Talk at the GPU Technology Conference (GTC) 2020, Silicon Valley},
  abstract     = {Patients who have suffered a heart attack have an elevated risk of developing arrhythmia. The use of computer simulations of the electrical activity in the hearts of these patients, is emerging as an alternative to traditional, more invasive examinations performed by doctors today. Recent advances in personalised arrhythmia risk prediction show that computational models can provide not only safer but also more accurate results than invasive procedures. However, biophysically accurate simulations of the electrical activity in the heart require solving linear systems over fine meshes and time resolutions, which can take hours or even days. This limits the use of such simulations in the clinic where diagnosis and treatment planning can be time sensitive, even if it is just for the reason of operation schedules. Furthermore, the non-interactive, non-intuitive way of accessing simulations and their results makes it hard to study these collaboratively. Overcoming these limitations requires speeding up computations from hours to seconds, which requires a massive increase in computational capabilities. We have developed a code that is capable of performing highly efficient heart simulations on the DGX-2, making use of all 16 V100 GPUs. Using a patient-specific unstructured tetrahedral mesh with 11.7 million cells, we are able to simulate the electrical heart activity at 1/30 of real-time. Moreover, we are able to show that the throughput achieved using all 16 GPUs in the DGX-2 is 77.6\% of the theoretical maximum. We achieved this through extensive optimisations of the two kernels constituting the body of the main loop in the simulator. In the kernel solving the diffusion equation (governing the spread of the electrical signal), constituting of a sparse matrix-vector multiplication, we minimise the memory traffic by reordering the mesh (and matrix) elements into clusters that fit in the V100{\textquoteright}s L2 cache. In the kernel solving the cell model (describing the complex interactions of ion channels in the cell membrane), we apply sophisticated domain-specific optimisations to reduce the number of floating point operations to the point where the kernel becomes memory bound. After optimisation, both kernels are memory bound, and we derive the minimum memory traffic, which we then divide by the aggregate memory bandwidth to obtain a lower bound on the execution time. Topics discussed include optimisations for sparse matrix-vector multiplications, strategies for handling inter-device communication for unstructured meshes, and lessons we learnt while programming the DGX-2.},
  author       = {Hustad, Kristian Gregorius and Cai, Xing and Langguth, Johannes and Arevalo, Hermenegild}
}
@incollection{27497,
  title     = {Operator Splitting and Finite Difference Schemes for Solving the {EMI} Model},
  booktitle = {Modeling Excitable Tissue: The {EMI} Framework},
  volume    = {7},
  year      = {2020},
  pages     = {44--55},
  publisher = {Springer International Publishing},
  chapter   = {4},
  address   = {Cham},
  abstract  = {We want to be able to perform accurate simulations of a large number of cardiac cells based on mathematical models where each individual cell is represented in the model. This implies that the computational mesh has to have a typical resolution of a few {\textmu}m leading to huge computational challenges. In this paper we use a certain operator splitting of the coupled equations and show that this leads to systems that can be solved in parallel. This opens up for the possibility of simulating large numbers of coupled cardiac cells.},
  isbn      = {978-3-030-61156-9},
  issn      = {2512-1677},
  doi       = {10.1007/978-3-030-61157-6_4},
  url       = {http://link.springer.com/content/pdf/10.1007/978-3-030-61157-6_4},
  author    = {J{\ae}ger, Karoline Horgmo and Hustad, Kristian Gregorius and Cai, Xing and Tveito, Aslak},
  editor    = {Tveito, Aslak and Mardal, Kent-Andre and Rognes, Marie E.}
}
@mastersthesis{27042,
  title    = {Solving the monodomain model efficiently on {GPUs}},
  school   = {Department of Informatics, University of Oslo},
  year     = {2019},
  month    = sep,
  pages    = {117},
  abstract = {Patients who have suffered a myocardial infarction have an elevated risk of developing arrhythmia. The use of in silico experiments of the electrical activity in the hearts of these patients, is emerging as an alternative to traditional, more invasive in situ examinations. One of the principal barriers to the use of in silico experiments is the tremendous amount of computational power required to perform such simulations. Building on an existing code, we create a complete solver for the monodomain model, which describes the electrical activity in the heart. Through extensive optimisations, we manage to efficiently utilise an NVIDIA DGX-2 machine, which is currently the most powerful single-box general-purpose computer with its 16 V100 GPUs. With this solver, we achieve simulation speeds of 2 heartbeats per wall clock minute on the DGX-2 using a realistic unstructured tetrahedral mesh with 11.7 million cells, and we show that the achieved execution time using all 16 GPUs in the DGX-2 is only 30.2\% higher than the theoretical lower bound.},
  keywords = {CUDA, electrocardiology, GPU, heterogeneous computing, High-performance computing, monodomain model},
  url      = {http://urn.nb.no/URN:NBN:no-74080},
  author   = {Hustad, Kristian Gregorius}
}
@misc{26853,
  title        = {Towards Detailed Real-Time Simulations of Cardiac Arrhythmia},
  year         = {2019},
  month        = sep,
  howpublished = {Poster at the International Conference in Computing in Cardiology, Singapore},
  abstract     = {Recent advances in personalized arrhythmia risk prediction show that computational models can provide not only safer but also more accurate results than invasive procedures. However, biophysically accurate simulations require solving linear systems over fine meshes and time resolutions, which can take hours or even days. This limits the use of such simulations in the clinic where diagnosis and treatment planning can be time sensitive, even if it is just for the reason of operation schedules. Furthermore, the non-interactive, non-intuitive way of accessing simulations and their results makes it hard to study these collaboratively. Overcoming these limitations requires speeding up computations from hours to seconds, which requires a massive increase in computational capabilities. Fortunately, the cost of computing has fallen dramatically in the past decade. A prominent reason for this is the recent introduction of manycore processors such as GPUs, which by now power the majority of the world{\textquoteright}s leading supercomputers. These devices owe their success to the fact that they are optimized for massively parallel workloads, such as applying similar ODE kernel computations to millions of mesh elements in scientific computing applications. Unlike CPUs, which are typically optimized for sequential performance, this allows GPU architectures to dedicate more transistors to performing computations, thereby increasing parallel speed and energy efficiency. In this poster, we present ongoing work on the parallelization of finite volume computations over an unstructured mesh as well as the challenges involved in building scalable simulation codes and discuss the steps needed to close the gap to accurate real-time computations.},
  author       = {Langguth, Johannes and Arevalo, Hermenegild and Hustad, Kristian Gregorius and Cai, Xing}
}
@inproceedings{27195,
  title     = {Towards Detailed Real-Time Simulations of Cardiac Arrhythmia},
  booktitle = {Computing in Cardiology},
  volume    = {46},
  year      = {2019},
  month     = dec,
  publisher = {IEEE},
  abstract  = {Recent advances in personalized arrhythmia risk prediction show that computational models can provide not only safer but also more accurate results than invasive procedures. However, biophysically accurate simulations require solving linear systems over fine meshes and time resolutions, which can take hours or even days. This limits the use of such simulations in the clinic where diagnosis and treatment planning can be time sensitive, even if it is just for the reason of operation schedules. Furthermore, the non-interactive, non-intuitive way of accessing simulations and their results makes it hard to study these collaboratively. Overcoming these limitations requires speeding up computations from hours to seconds, which requires a massive increase in computational capabilities. Fortunately, the cost of computing has fallen dramatically in the past decade. A prominent reason for this is the recent introduction of manycore processors such as GPUs, which by now power the majority of the world{\textquoteright}s leading supercomputers. These devices owe their success to the fact that they are optimized for massively parallel workloads, such as applying similar ODE kernel computations to millions of mesh elements in scientific computing applications. Unlike CPUs, which are typically optimized for sequential performance, this allows GPU architectures to dedicate more transistors to performing computations, thereby increasing parallel speed and energy efficiency.},
  author    = {Langguth, Johannes and Arevalo, Hermenegild and Hustad, Kristian Gregorius and Cai, Xing}
}