% Journal article: "Dynamic Fault Tolerance in Fat-Trees", IEEE Transactions on Computers 60(4), 2011.
@article{11282,
  author    = {Sem-Jacobsen, Frank and Skeie, Tor and Lysne, Olav and Duato, Jose},
  title     = {Dynamic Fault Tolerance in Fat-Trees},
  abstract  = {Fat-trees are a very common communication architecture in current large-scale parallel computers. The probability of failure increases with the number of components. We present a routing method for deterministically and adaptively routed fat trees, applicable to both distributed and source routing, that is able to handle several concurrent faults and that transparently returns to the original routing strategy once the faulty components have recovered. The method is local and dynamic, completely masking the fault from the rest of the system. It only requires a small extra functionality in the switches to handle misrouting around a fault. The method guarantees connectedness and deadlock and livelock freedom for up to radix/2 -1 arbitrary simultaneous faults. Our simulation experiments show a graceful degradation of performance as faults are added. Furthermore, they demonstrate that for most fault combinations, our method will even be able to handle significantly more faults beyond the radix/2-1 limit with high probability.},
  journal   = {IEEE Transactions on Computers},
  volume    = {60},
  number    = {4},
  pages     = {508--525},
  month     = apr,
  year      = {2011},
  publisher = {IEEE},
  doi       = {10.1109/TC.2010.97},
}