% Conference paper: "AdapCK" (Euro-Par 2024, published in the LNCS series).
% The citation key looks like an export-generated identifier; it is kept
% unchanged so that existing \cite commands continue to resolve.
% The DOI is stored bare (no LaTeX escaping of "_"); doi/url-aware styles
% handle the underscore themselves.
% NOTE(review): `address` holds a country, as in the exported record; the
% publisher's city would be more conventional -- confirm and refine.
% NOTE(review): `note` carries conference name/date details from the export;
% the LNCS volume number is not present in the record -- add once verified.
@inproceedings{5324b1b8e4e74c809e570bedfd83af3a,
  author    = {Jia, Jie and Liu, Yi and Liu, Yanke and Chen, Yifan and Lin, Fang},
  title     = {{AdapCK}: Optimizing {I/O} for Checkpointing on Large-Scale High Performance Computing Systems},
  booktitle = {Euro-Par 2024},
  editor    = {Carretero, Jesus and Garcia-Blas, Javier and Shende, Sameer and Brandic, Ivona and Olcoz, Katzalin and Schreiber, Martin},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2024},
  pages     = {342--355},
  doi       = {10.1007/978-3-031-69583-4_24},
  isbn      = {9783031695827},
  language  = {english},
  keywords  = {Checkpoint, DMTCP, Fault tolerance, High-Performance Computing, Parallel file system},
  abstract  = {With the scaling-up of high-performance computing (HPC) systems, the resilience has become an important challenge. As a widely used resilience technique for HPC systems, checkpointing saves checkpoints of the system during the execution of parallel programs, and in case of failure, recovers the execution of the program from the most recent checkpoint. However, large-scale parallel programs often produce thousands of processes, and result in large-volume simultaneous data-writings on each checkpoint, which impacts the storage as well as the parallel file systems of HPC. To tackle this problem, this paper proposes AdapCK, an I/O-optimization scheme for checkpointing on large-scale HPC systems. AdapCK consists of two main parts: a load-balancing mechanism used for balancing workloads across low-level storage volumes on checkpointing, and a throughput-aware checkpoint-data writing mechanism used for reducing I/O contentions and increasing utilization of I/O-bandwidth. Experiment results show that the AdapCK can reduce the checkpoint time by more than 30\%, up to 54.5\%.},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.; 30th International Conference on Parallel and Distributed Computing, Euro-Par 2024 ; Conference date: 26-08-2024 Through 30-08-2024},
}