@misc{15833, keywords = {Continuous Engineering, failure diagnosis, log analysis, log mining, spectrum-based analysis}, author = {Carl Rosenberg and Leon Moonen}, title = {Spectrum-Based Log Diagnosis}, abstract = {Background: Continuous Engineering practices are increasingly adopted in modern software development. However, a frequently reported need is for more effective methods to analyze the massive amounts of data resulting from the numerous build and test runs. Aims: We present and evaluate Spectrum-Based Log Diagnosis (SBLD), a method to help developers quickly diagnose problems found in complex integration and deployment runs. Inspired by Spectrum-Based Fault Localization, SBLD leverages the differences in event occurrences between logs for failing and passing runs, to highlight events that are stronger associated with failing runs. Method: Using data provided by our industrial partner, we empirically investigate the following questions: (i) How well does SBLD reduce the effort needed to identify all failure-relevant events in the log for a failing run? (ii) How is the performance of SBLD affected by available data? (iii) How does SBLD compare to searching for simple textual patterns that often occur in failure-relevant events? We answer (i) and (ii) using summary statistics and heatmap visualizations, and for (iii) we compare three configurations of SBLD (with resp. minimum, median and maximum data) against a textual search using Wilcoxon signed-rank tests and the Vargha-Delaney measure of stochastic superiority.Results: Our evaluation shows that (i) SBLD achieves a significant effort reduction for the dataset used, (ii) SBLD benefits from additional logs for passing runs in general, and it benefits from additional logs for failing runs when there is a proportional amount of logs for passing runs in the data. Finally, (iii) SBLD and textual search are roughly equally effective at effort-reduction, while textual search has slightly better recall. We investigate the cause, and discuss how it is due to characteristics of a specific part of our data. Conclusions: We conclude that SBLD shows promise as a method diagnosing failing runs, that its performance is positively affected by additional data, but that it does not outperform textual search on the dataset considered. Future work includes investigating SBLD{\textquoteright}s generalizability on additional datasets.}, year = {2020}, journal = {Empirical Software Engineering and Measurement (ESEM)}, publisher = {ACM}, isbn = {9781450375801}, url = {https://doi.org/10.1145/3382494.3410684}, doi = {10.1145/3382494.3410684}, editor = {Marcos Kalinowski and Federica Sarro}, }