% Conference paper (venue string names ESEM 2018); stored as @inproceedings so
% the venue is printed via booktitle. Citation key kept as 15031.
@inproceedings{15031,
  author    = {Rosenberg, Carl and Moonen, Leon},
  title     = {Improving Problem Identification via Automated Log Clustering using Dimensionality Reduction},
  booktitle = {12th International Symposium on Empirical Software Engineering and Measurement ({ESEM} 2018)},
  year      = {2018},
  pages     = {1--10},
  publisher = {ACM},
  abstract  = {Background: Continuous engineering practices, such as continuous
    integration and continuous deployment, see increased adoption in
    modern software development. A frequently reported challenge for
    adopting these practices is the need to make sense of the large
    amounts of data that they generate.
    Goal: We consider the problem of automatically grouping logs of runs
    that failed for the same underlying reasons, so that they can be
    treated more effectively, and investigate the following questions:
    (1) Does an approach developed to identify problems in system logs
    generalize to identifying problems in continuous deployment logs?
    (2) How does dimensionality reduction affect the quality of automated
    log clustering? (3) How does the criterion used for merging clusters
    in the clustering algorithm affect clustering quality?
    Method: We replicate and extend earlier work on clustering system
    log files to assess its generalization to continuous deployment
    logs. We consider the optional inclusion of one of these dimensionality
    reduction techniques: Principal Component Analysis (PCA), Latent
    Semantic Indexing (LSI), and Non-negative Matrix Factorization
    (NMF). Moreover, we consider three alternative cluster merge
    criteria (Single Linkage, Average Linkage, and Weighted Linkage),
    in addition to the Complete Linkage criterion used in earlier work.
    We empirically evaluate the 16 resulting configurations on continuous
    deployment logs provided by our industrial collaborator.
    Results: Our study shows that (1) identifying problems in continuous
    deployment logs via clustering is feasible, (2) including NMF
    significantly improves overall accuracy and robustness, and (3)
    Complete Linkage performs best of all merge criteria analyzed.
    Conclusions: We conclude that problem identification via automated
    log clustering is improved by including dimensionality reduction,
    as it decreases the pipeline{\textquoteright}s sensitivity to parameter choice,
    thereby increasing its robustness for handling different inputs.},
}