% Dubey2008: SCARF poster paper, e-Science 2008 (record imported from dblp).
% The fields category, contribution, project, file and keywords are local
% annotations; standard bibliography styles ignore unknown fields.
@inproceedings{Dubey2008,
author = {Dubey, Abhishek and Neema, Sandeep and Kowalkowski, Jim and Singh, Amitoj},
booktitle = {Fourth International Conference on e-Science, e-Science 2008, 7-12 December 2008, Indianapolis, IN, {USA}},
title = {Scientific Computing Autonomic Reliability Framework},
year = {2008},
pages = {352--353},
abstract = {Large scientific computing clusters require a distributed dependability subsystem that can provide fault isolation and recovery and is capable of learning and predicting failures, to improve the reliability of scientific workflows. In this paper, we outline the key ideas in the design of a Scientific Computing Autonomic Reliability Framework (SCARF) for large computing clusters used in the Lattice Quantum Chromo Dynamics project at Fermi Lab.},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/conf/eScience/DubeyNKS08},
category = {poster},
contribution = {lead},
doi = {10.1109/eScience.2008.113},
file = {:Dubey2008-Scientific_Computing_Autonomic_Reliability_Framework.pdf:PDF},
keywords = {cluster reliability, autonomous systems, fault tolerance, model-based design, monitoring, mitigation strategies, distributed computing},
project = {cps-middleware,cps-reliability},
timestamp = {Wed, 16 Oct 2019 14:14:49 +0200},
url = {https://doi.org/10.1109/eScience.2008.113}
}