% Workshop paper: CARE (causal-aware root cause analysis), HAOC 2021, co-located with EuroSys 2021.
% Cleaned from a portal export: acronym braced in title, language normalised to ASCII,
% series no longer duplicates booktitle, note punctuation repaired.
@inproceedings{c54a22cdf8454ae294e22be19a1265df,
  author    = {Xu, Yong and Zhang, Xu and Luo, Chuan and Qin, Si and Pandey, Rohit and Du, Chao and Lin, Qingwei and Dang, Yingnong and Zhou, Andrew},
  title     = {{CARE}: Infusing Causal Aware Thinking to Root Cause Analysis in Cloud System},
  booktitle = {{HAOC} 2021 - Proceedings of the 2021 1st Workshop on High Availability and Observability of Cloud Systems},
  series    = {{HAOC} '21},
  publisher = {Association for Computing Machinery, Inc},
  pages     = {1--3},
  year      = {2021},
  month     = apr,
  day       = {26},
  doi       = {10.1145/3447851.3458737},
  language  = {English},
  keywords  = {Cloud system, Reliability, Root cause analysis},
  abstract  = {With millions of customers accessing online service all over the world, ensuring high service availability is very critical for cloud system. In recent years, empowered by advanced data mining and machine learning technology, there emerges extensive study on data-driven solution to detect anomalous system behavior and diagnose the root cause. However, without any surveillance of data generation process, the existing passive data-driven approach may lead to biased analysis result induced by observed and unobserved confounding factors in the dynamic and heterogeneous system, and thus affect service availability with misleading mitigation actions. In this paper, we propose to infuse causal thinking to the current data-driven solution for cloud system. We developed CARE, a causal aware root cause discovery engine, which utilizes Random Control Trial to proactively generate less ambiguous data for further analysis. A case study shows the application of CARE to Microsoft Office365.},
  note      = {1st Workshop on High Availability and Observability of Cloud Systems, HAOC 2021, held in conjunction with EuroSys 2021. Conference date: 26-04-2021 through 26-04-2021. Publisher copyright: {\textcopyright} 2021 ACM.},
}