@misc{16768, author = {Christoph Laaber}, title = {Predicting Unstable Software Benchmarks using Static Source Code Features}, abstract = {Software benchmarks are only as good as the performance measurements they yield. Unstable benchmarks show high variability among repeated measurements, which causes uncertainty about the {\textquotedblleft}true{\textquotedblright} performance of the measured software unit and complicates reliable change assessment. Conversely, if multiple repeated measurements have low variability, i.e., the distribution of the measurement results is narrow, a benchmark is considered stable. However, whether a benchmark is stable or unstable only becomes evident after it has been executed and its results are available. In this paper, we introduce a machine-learning-based approach to predict a benchmark{\textquoteright}s stability without executing it. Our approach statically extracts 58 source code features, both from the benchmark code and from the code called by a benchmark. It parses the abstract syntax trees (ASTs) of all functions, counts the occurrences of each feature for each function, computes the functions reachable from a benchmark using static call graphs (CGs), sums up the feature occurrences for each benchmark, and feeds the features into a binary classifier. Inspired by previous software performance research, the employed features act as proxies for performance variability (and consequently benchmark stability) and are related to: (1) meta-information, e.g., lines of code (LOC); (2) programming language elements, e.g., conditionals or loops; (3) potentially performance-impacting standard library calls, e.g., file and network input/output (I/O). To assess our approach{\textquoteright}s effectiveness, we perform a large-scale experiment on 4,461 Go performance benchmarks from 230 open-source software (OSS) projects. First, we assess the prediction performance of our machine learning models using 11 binary classification algorithms. We find that Random Forest performs best, with good prediction performance ranging from 0.79 to 0.90 in terms of Area Under the Curve (AUC) and from 0.43 to 0.68 in terms of Matthews Correlation Coefficient (MCC). Second, we carry out four sensitivity analyses to investigate the impact on prediction performance when the model is trained (1) with different variability thresholds that classify a benchmark as stable or unstable, (2) on benchmark executions with a varying number of measurement repetitions, (3) after applying specific pre-processing steps to remove collinear and multi-collinear features and to rebalance classes in the training set, and (4) for different variability measures used as the dependent variable. We find that our model performs best when trained on larger thresholds (10\%) and on executions with more repetitions (30). While feature pre-processing does not have an impact across all studied algorithms, removing collinear and multi-collinear features improves Random Forest{\textquoteright}s prediction performance by 0.023 MCC and 0.005 AUC. The model shows varying prediction performance depending on which variability measure is used as the dependent variable; however, it performs well across all three studied measures. Third, we perform feature importance analyses for individual features and feature categories.
We find that 7 features related to meta-information, slice usage, nested loops, and synchronization application programming interfaces (APIs) are individually important for good predictions; and that the combination of all features of the called source code is paramount for our model, while the combination of features of the benchmark itself is less important. Our results show that although benchmark stability is affected by more than just the source code, we can effectively utilize machine learning models to predict whether a benchmark will be stable ahead of execution. This enables spending precious testing time on reliable benchmarks, supporting developers in identifying unstable benchmarks during development, allowing unstable benchmarks to be repeated more often, estimating stability in scenarios where repeated benchmark execution is infeasible or impossible, and warning developers if new benchmarks or existing benchmarks executed in new environments will be unstable.}, year = {2022}, journal = {International Conference on Software Engineering (ICSE)}, }