@comment{
  Entry 16551: arXiv preprint 2104.05642 (Reinke et al.).
  The numeric citation key is kept unchanged so that existing citations
  of this entry continue to resolve.
  Author names are stored in "von Last, First" form.
  The eprint, archiveprefix and primaryclass fields carry the arXiv
  identifier for styles that understand them; other styles ignore them.
}
@misc{16551,
  author        = {Reinke, Annika and
                   Tizabi, Minu and
                   Sudre, Carole and
                   Eisenmann, Matthias and
                   R{\"a}dsch, Tim and
                   Baumgartner, Michael and
                   Acion, Laura and
                   Antonelli, Michela and
                   Arbel, Tal and
                   Bakas, Spyridon and
                   Bankhead, Peter and
                   Benis, Arriel and
                   Cardoso, Jorge and
                   Cheplygina, Veronika and
                   Cimini, Beth and
                   Collins, Gary and
                   Farahani, Keyvan and
                   Glocker, Ben and
                   Godau, Patrick and
                   Hamprecht, Fred and
                   Hashimoto, Daniel and
                   Heckmann-N{\"o}tzel, Doreen and
                   Hoffmann, Michael and
                   Huisman, Merel and
                   Isensee, Fabian and
                   Jannin, Pierre and
                   Kahn, Charles and
                   Karargyris, Alexandros and
                   Karthikesalingam, Alan and
                   Kainz, Bernhard and
                   Kavur, Emre and
                   Kenngott, Hannes and
                   Kleesiek, Jens and
                   Kooi, Thijs and
                   Kozubek, Michal and
                   Kreshuk, Anna and
                   Kurc, Tahsin and
                   Landman, Bennett and
                   Litjens, Geert and
                   Madani, Amin and
                   Maier-Hein, Klaus and
                   Martel, Anne and
                   Mattson, Peter and
                   Meijering, Erik and
                   Menze, Bjoern and
                   Moher, David and
                   Moons, Karel and
                   M{\"u}ller, Henning and
                   Nickel, Felix and
                   Petersen, Jens and
                   Polat, Gorkem and
                   Rajpoot, Nasir and
                   Reyes, Mauricio and
                   Rieke, Nicola and
                   Riegler, Michael and
                   Rivaz, Hassan and
                   Saez-Rodriguez, Julio and
                   Gutierrez, Clarisa and
                   Schroeter, Julien and
                   Saha, Anindo and
                   Shetty, Shravya and
                   Stieltjes, Bram and
                   Summers, Ronald and
                   Taha, Abdel and
                   Tsaftaris, Sotirios and
                   van Ginneken, Bram and
                   Varoquaux, Ga{\"e}l and
                   Wiesenfarth, Manuel and
                   Yaniv, Ziv and
                   Kopp-Schneider, Annette and
                   J{\"a}ger, Paul and
                   Maier-Hein, Lena},
  title         = {Common Limitations of Image Processing Metrics: A Picture Story},
  year          = {2022},
  publisher     = {arXiv},
  eprint        = {2104.05642},
  archiveprefix = {arXiv},
  primaryclass  = {eess.IV},
  doi           = {10.48550/ARXIV.2104.05642},
  url           = {https://arxiv.org/abs/2104.05642},
  keywords      = {Image and Video Processing (eess.IV), Computer Vision and Pattern Recognition (cs.CV), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences},
  abstract      = {While the importance of automatic image analysis is continuously increasing, recent meta-research revealed major flaws with respect to algorithm validation. Performance metrics are particularly key for meaningful, objective, and transparent performance assessment and validation of the used automatic algorithms, but relatively little attention has been given to the practical pitfalls when using specific metrics for a given image analysis task. These are typically related to (1) the disregard of inherent metric properties, such as the behaviour in the presence of class imbalance or small target structures, (2) the disregard of inherent data set properties, such as the non-independence of the test cases, and (3) the disregard of the actual biomedical domain interest that the metrics should reflect. This living dynamically document has the purpose to illustrate important limitations of performance metrics commonly applied in the field of image analysis. In this context, it focuses on biomedical image analysis problems that can be phrased as image-level classification, semantic segmentation, instance segmentation, or object detection task. The current version is based on a Delphi process on metrics conducted by an international consortium of image analysis experts from more than 60 institutions worldwide.},
}