@article{15007, author = {Feroz Zahid and Amir Taherkordi and Ernst Gran and Tor Skeie and Bj{\o}rn Johnsen}, title = {A Self-Adaptive Network for HPC Clouds: Architecture, Framework, and Implementation}, abstract = {Clouds offer flexible and economically attractive compute and storage solutions for enterprises. However, the effectiveness of cloud computing for high-performance computing (HPC) systems still remains questionable. When clouds are deployed on lossless interconnection networks, like InfiniBand (IB), challenges related to load-balancing, low-overhead virtualization, and performance isolation hinder full potential utilization of the underlying interconnect. Moreover, cloud data centers incorporate a highly dynamic environment rendering static network reconfigurations, typically used in IB systems, infeasible. In this paper, we present a framework for a self-adaptive network architecture for HPC clouds based on lossless interconnection networks, demonstrated by means of our implemented IB prototype. Our solution, based on a feedback control and optimization loop, enables the lossless HPC network to dynamically adapt to the varying traffic patterns, current resource availability, workload distributions, and also in accordance with the service provider-defined policies. Furthermore, we present IBAdapt, a simplified ruled-based language for the service providers to specify adaptation strategies used by the framework. Our developed self-adaptive IB network prototype is demonstrated using state-of-the-art industry software. The results obtained on a test cluster demonstrate the feasibility and effectiveness of the framework when it comes to improving Quality-of-Service compliance in HPC clouds.}, year = {2018}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, pages = {2658-2671}, publisher = {IEEE}, doi = {10.1109/TPDS.2018.2842224}, }