[{"data":1,"prerenderedAt":1010},["ShallowReactive",2],{"navigation_docs_zh":3,"blog_zh_how-castrel-builds-an-incident-troubleshooting-agent":280},[4,18,51,254,267],{"title":5,"icon":6,"path":7,"stem":8,"children":9,"page":6},"入门",false,"/zh/docs/getting-started","zh/docs/1.getting-started",[10,14],{"title":11,"path":12,"stem":13},"简介","/zh/docs/getting-started/introduction","zh/docs/1.getting-started/1.introduction",{"title":15,"path":16,"stem":17},"快速开始","/zh/docs/getting-started/quick-start","zh/docs/1.getting-started/2.quick-start",{"title":19,"icon":6,"path":20,"stem":21,"children":22,"page":6},"功能","/zh/docs/features","zh/docs/2.features",[23,27,31,35,39,43,47],{"title":24,"path":25,"stem":26},"告警分诊","/zh/docs/features/alert-triage","zh/docs/2.features/2.alert-triage",{"title":28,"path":29,"stem":30},"事故调查","/zh/docs/features/incident-investigation","zh/docs/2.features/3.incident-investigation",{"title":32,"path":33,"stem":34},"部署验证","/zh/docs/features/deployment-verification","zh/docs/2.features/4.deployment-verification",{"title":36,"path":37,"stem":38},"数据探索","/zh/docs/features/data-exploration","zh/docs/2.features/5.data-exploration",{"title":40,"path":41,"stem":42},"知识库","/zh/docs/features/knowledges","zh/docs/2.features/6.knowledges",{"title":44,"path":45,"stem":46},"Castrel Proxy","/zh/docs/features/castrel-proxy","zh/docs/2.features/7.castrel-proxy",{"title":48,"path":49,"stem":50},"自动化","/zh/docs/features/automations","zh/docs/2.features/8.automations",{"title":52,"icon":6,"path":53,"stem":54,"children":55},"集成","/zh/docs/integrations","zh/docs/3.integrations/index",[56,57,62,67,72,77,81,85,89,94,99,104,109,113,117,122,127,131,136,141,146,151,156,160,165,170,174,178,183,188,193,198,203,208,212,216,220,224,229,234,239,244,249],{"title":52,"path":53,"stem":54},{"title":58,"path":59,"stem":60,"icon":61},"Prometheus","/zh/docs/integrations/prometheus","zh/docs/3.integrations/1.prometheus","i-simple-icons-prometheus",{"title":63,"path":64,"stem":65,"icon":66},"AWS","/zh/docs/integrations/aws","zh/docs/3.integrations/10.aws","i-simple-icons-amazonwebservices",{"title":68,"path":69,"stem":70,"icon":71},"阿里云（Aliyun）","/zh/docs/integrations/aliyun","zh/docs/3.integrations/12.aliyun","i-simple-icons-alibabacloud",{"title":73,"path":74,"stem":75,"icon":76},"腾讯云（Tencent Cloud）","/zh/docs/integrations/tencent-cloud","zh/docs/3.integrations/13.tencent-cloud","i-lucide-plug",{"title":78,"path":79,"stem":80,"icon":76},"华为云（Huawei Cloud）","/zh/docs/integrations/huaweicloud","zh/docs/3.integrations/14.huaweicloud",{"title":82,"path":83,"stem":84,"icon":76},"火山引擎（Volcengine）","/zh/docs/integrations/volcengine","zh/docs/3.integrations/15.volcengine",{"title":86,"path":87,"stem":88,"icon":76},"轻帆云（QingFanYun）","/zh/docs/integrations/qingfanyun","zh/docs/3.integrations/16.qingfanyun",{"title":90,"path":91,"stem":92,"icon":93},"Grafana","/zh/docs/integrations/grafana","zh/docs/3.integrations/17.grafana","i-simple-icons-grafana",{"title":95,"path":96,"stem":97,"icon":98},"VictoriaMetrics","/zh/docs/integrations/victoriametrics","zh/docs/3.integrations/18.victoriametrics","i-simple-icons-victoriametrics",{"title":100,"path":101,"stem":102,"icon":103},"New Relic","/zh/docs/integrations/new-relic","zh/docs/3.integrations/19.new-relic","i-simple-icons-newrelic",{"title":105,"path":106,"stem":107,"icon":108},"Elasticsearch","/zh/docs/integrations/elasticsearch","zh/docs/3.integrations/2.elasticsearch","i-simple-icons-elasticsearch",{"title":110,"path":111,"stem":112,"icon":76},"Zabbix","/zh/docs/integrations/zabbix","zh/docs/3.integrations/20.zabbix",{"title":114,"path":115,"stem":116,"icon":76},"监控宝（JianKongBao）","/zh/docs/integrations/jiankongbao","zh/docs/3.integrations/21.jiankongbao",{"title":118,"path":119,"stem":120,"icon":121},"PagerDuty","/zh/docs/integrations/pagerduty","zh/docs/3.integrations/22.pagerduty","i-simple-icons-pagerduty",{"title":123,"path":124,"stem":125,"icon":126},"Sentry","/zh/docs/integrations/sentry","zh/docs/3.integrations/23.sentry","i-simple-icons-sentry",{"title":128,"path":129,"stem":130,"icon":76},"Freshworks / Freshservice","/zh/docs/integrations/freshworks","zh/docs/3.integrations/24.freshworks",{"title":132,"path":133,"stem":134,"icon":135},"Linear","/zh/docs/integrations/linear","zh/docs/3.integrations/25.linear","i-simple-icons-linear",{"title":137,"path":138,"stem":139,"icon":140},"ClickHouse","/zh/docs/integrations/clickhouse","zh/docs/3.integrations/26.clickhouse","i-simple-icons-clickhouse",{"title":142,"path":143,"stem":144,"icon":145},"Kubernetes","/zh/docs/integrations/kubernetes","zh/docs/3.integrations/27.kubernetes","i-simple-icons-kubernetes",{"title":147,"path":148,"stem":149,"icon":150},"Terraform Cloud / HCP Terraform","/zh/docs/integrations/terraform","zh/docs/3.integrations/28.terraform","i-simple-icons-terraform",{"title":152,"path":153,"stem":154,"icon":155},"Jenkins","/zh/docs/integrations/jenkins","zh/docs/3.integrations/29.jenkins","i-simple-icons-jenkins",{"title":157,"path":158,"stem":159,"icon":93},"Grafana Loki","/zh/docs/integrations/grafana-loki","zh/docs/3.integrations/3.grafana-loki",{"title":161,"path":162,"stem":163,"icon":164},"Ansible / AWX","/zh/docs/integrations/ansible","zh/docs/3.integrations/30.ansible","i-simple-icons-ansible",{"title":166,"path":167,"stem":168,"icon":169},"GitLab","/zh/docs/integrations/gitlab","zh/docs/3.integrations/31.gitlab","i-simple-icons-gitlab",{"title":171,"path":172,"stem":173,"icon":76},"钉钉（DingTalk）","/zh/docs/integrations/dingtalk","zh/docs/3.integrations/32.dingtalk",{"title":175,"path":176,"stem":177,"icon":76},"飞书（Feishu / Lark）","/zh/docs/integrations/feishu","zh/docs/3.integrations/33.feishu",{"title":179,"path":180,"stem":181,"icon":182},"Telegram","/zh/docs/integrations/telegram","zh/docs/3.integrations/34.telegram","i-simple-icons-telegram",{"title":184,"path":185,"stem":186,"icon":187},"Email","/zh/docs/integrations/email","zh/docs/3.integrations/35.email","i-simple-icons-gmail",{"title":189,"path":190,"stem":191,"icon":192},"微信企业机器人（Weixin Clawbot）","/zh/docs/integrations/weixin-clawbot","zh/docs/3.integrations/36.weixin-clawbot","i-simple-icons-wechat",{"title":194,"path":195,"stem":196,"icon":197},"Notion","/zh/docs/integrations/notion","zh/docs/3.integrations/37.notion","i-simple-icons-notion",{"title":199,"path":200,"stem":201,"icon":202},"Confluence","/zh/docs/integrations/confluence","zh/docs/3.integrations/38.confluence","i-simple-icons-confluence",{"title":204,"path":205,"stem":206,"icon":207},"Google Docs","/zh/docs/integrations/google-docs","zh/docs/3.integrations/39.google-docs","i-simple-icons-googledocs",{"title":209,"path":210,"stem":211,"icon":93},"Grafana Tempo","/zh/docs/integrations/grafana-tempo","zh/docs/3.integrations/4.grafana-tempo",{"title":213,"path":214,"stem":215,"icon":76},"钉钉文档（DingTalk Docs）","/zh/docs/integrations/dingtalk-docs","zh/docs/3.integrations/40.dingtalk-docs",{"title":217,"path":218,"stem":219,"icon":76},"LDAP","/zh/docs/integrations/ldap","zh/docs/3.integrations/41.ldap",{"title":221,"path":222,"stem":223,"icon":76},"Dify","/zh/docs/integrations/dify","zh/docs/3.integrations/42.dify",{"title":225,"path":226,"stem":227,"icon":228},"自定义 MCP（Custom MCP）","/zh/docs/integrations/custom-mcp","zh/docs/3.integrations/43.custom-mcp","i-simple-icons-anthropic",{"title":230,"path":231,"stem":232,"icon":233},"GitHub","/zh/docs/integrations/github","zh/docs/3.integrations/5.github","i-simple-icons-github",{"title":235,"path":236,"stem":237,"icon":238},"Slack","/zh/docs/integrations/slack","zh/docs/3.integrations/6.slack","i-simple-icons-slack",{"title":240,"path":241,"stem":242,"icon":243},"Vercel","/zh/docs/integrations/vercel","zh/docs/3.integrations/7.vercel","i-simple-icons-vercel",{"title":245,"path":246,"stem":247,"icon":248},"Graylog","/zh/docs/integrations/graylog","zh/docs/3.integrations/8.graylog","i-simple-icons-graylog",{"title":250,"path":251,"stem":252,"icon":253},"Datadog","/zh/docs/integrations/datadog","zh/docs/3.integrations/9.datadog","i-simple-icons-datadog",{"title":255,"path":256,"stem":257,"children":258,"page":6},"更多","/zh/docs/more","zh/docs/4.more",[259,263],{"title":260,"path":261,"stem":262},"路线图","/zh/docs/more/roadmap","zh/docs/4.more/1.roadmap",{"title":264,"path":265,"stem":266},"支持","/zh/docs/more/support","zh/docs/4.more/2.support",{"title":268,"path":269,"stem":270,"children":271,"page":6},"安全","/zh/docs/security","zh/docs/5.security",[272,276],{"title":273,"path":274,"stem":275},"隐私政策","/zh/docs/security/privacy-policy","zh/docs/5.security/1.privacy-policy",{"title":277,"path":278,"stem":279},"服务条款","/zh/docs/security/terms-of-service","zh/docs/5.security/2.terms-of-service",{"id":281,"title":282,"body":283,"description":997,"extension":998,"meta":999,"navigation":6,"path":1005,"seo":1006,"stem":1008,"__hash__":1009},"blogs_zh/zh/blogs/1.how-castrel-builds-an-incident-troubleshooting-agent.md","Castrel 如何构建事故排障 Agent",{"type":284,"value":285,"toc":966},"minimark",[286,304,307,311,316,318,323,330,333,337,400,403,406,413,429,432,435,455,460,462,466,470,473,479,506,509,512,515,584,587,589,593,599,603,623,626,640,643,654,657,659,663,666,669,672,677,683,690,693,696,710,713,716,719,725,728,730,734,740,743,746,753,822,829,837,840,843,846,866,869,872,883,886,892,895,897,901,963],[287,288,291,292,296,297,296,300,303],"callout",{"color":289,"icon":290},"info","i-lucide-info","本文介绍 Castrel 事故排障 Agent 的核心设计理念，包含：",[293,294,295],"strong",{},"假设驱动调查","、",[293,298,299],{},"人机协作",[293,301,302],{},"业务知识沉淀","，帮助团队从“发现问题”到“定位根因或快速升级”形成高效闭环。",[305,306],"hr",{},[308,309,310],"p",{},"下图展示了 Castrel 事故排障 Agent 的核心工作流。",[312,313],"mermaid",{":config":314,"code":315},"config","%25%25%7Binit%3A%20%7B'flowchart'%3A%20%7B'subGraphTitleMargin'%3A%20%7B'top'%3A%200%2C%20'bottom'%3A%200%7D%7D%7D%7D%25%25%0Agraph%20TB%0A%20%20%20%20subgraph%20main%5B%22%20%22%5D%0A%20%20%20%20%20%20%20%20direction%20LR%0A%20%20%20%20%20%20%20%20A%5B%E7%97%87%E7%8A%B6%5D%0A%0A%20%20%20%20%20%20%20%20subgraph%20Y%5B%22%E4%BA%8B%E6%95%85%E5%88%86%E6%9E%90%22%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20direction%20LR%0A%20%20%20%20%20%20%20%20%20%20%20%20B%5B%E7%94%9F%E6%88%90%E5%81%87%E8%AE%BE%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20C%5B%E6%94%B6%E9%9B%86%E4%B8%8A%E4%B8%8B%E6%96%87%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20D%5B%E9%AA%8C%E8%AF%81%E5%81%87%E8%AE%BE%5D%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20B%20--%3E%20C%0A%20%20%20%20%20%20%20%20%20%20%20%20C%20--%3E%20D%0A%20%20%20%20%20%20%20%20%20%20%20%20D%20--%3E%20B%0A%20%20%20%20%20%20%20%20end%0A%0A%20%20%20%20%20%20%20%20F%5B%E6%8E%92%E9%9A%9C%E6%8A%A5%E5%91%8A%5D%0A%0A%20%20%20%20%20%20%20%20A%20--%3E%20%7C%E8%A7%A6%E5%8F%91%7C%20Y%0A%20%20%20%20%20%20%20%20Y%20--%3E%20%7C%E9%80%80%E5%87%BA%7C%20F%0A%20%20%20%20end%0A%0A%20%20%20%20Z%5B%E4%BA%BA%E7%B1%BB%5D%0A%0A%20%20%20%20subgraph%20X%5B%22%E4%BA%8B%E6%95%85%E7%9F%A5%E8%AF%86%22%5D%0A%20%20%20%20%20%20%20%20X1%5B%22%E5%8E%86%E5%8F%B2%E4%BA%8B%E6%95%85%22%5D%0A%20%20%20%20%20%20%20%20X2%5B%22Runbook%22%5D%0A%20%20%20%20end%0A%0A%20%20%20%20X%20--%3E%20%7C%E5%8F%AC%E5%9B%9E%7C%20main%0A%20%20%20%20main%20--%3E%20%7C%E6%B2%89%E6%B7%80%7C%20X%0A%20%20%20%20Z%20--%3E%20%7C%E5%8F%8D%E9%A6%88%7C%20main%0A%20%20%20%20main%20--%3E%20%7C%E5%B1%95%E7%A4%BA%E7%BB%93%E6%9E%9C%7C%20Z",[305,317],{},[319,320,322],"h2",{"id":321},"_1-可观测性上下文","1. 可观测性上下文",[308,324,325],{},[326,327],"img",{"alt":328,"src":329},"AI Analysis and Observability Data","/images/blog/ai-troubleshooting/observability.png",[308,331,332],{},"AI 排障效果很大程度取决于它能够访问到的上下文数据。完整的可观测性上下文应包含以下维度。",[334,335,336],"h3",{"id":336},"三类核心可观测性数据",[338,339,340,357],"table",{},[341,342,343],"thead",{},[344,345,346,351,354],"tr",{},[347,348,350],"th",{"align":349},"left","数据类型",[347,352,353],{"align":349},"作用",[347,355,356],{"align":349},"常见来源",[358,359,360,374,387],"tbody",{},[344,361,362,368,371],{},[363,364,365],"td",{"align":349},[293,366,367],{},"Metrics",[363,369,370],{"align":349},"发现异常、量化问题严重程度",[363,372,373],{"align":349},"Prometheus、Zabbix、CloudWatch",[344,375,376,381,384],{},[363,377,378],{"align":349},[293,379,380],{},"Logs",[363,382,383],{"align":349},"定位具体错误、获得上下文细节",[363,385,386],{"align":349},"Elasticsearch、Loki、Splunk",[344,388,389,394,397],{},[363,390,391],{"align":349},[293,392,393],{},"Traces",[363,395,396],{"align":349},"追踪请求路径、定位慢调用位置",[363,398,399],{"align":349},"Jaeger、Tempo、SkyWalking",[308,401,402],{},"仅依赖任意单一数据类型都很难高效排障。Metrics 告诉你“出了问题”，Logs 告诉你“具体是什么错”，Traces 告诉你“链路上哪里出了问题”。",[334,404,405],{"id":405},"调用关系与部署关系",[308,407,408,409,412],{},"除了三类可观测性数据，AI 还需要理解系统的",[293,410,411],{},"拓扑关系","：",[414,415,416,423],"ul",{},[417,418,419,422],"li",{},[293,420,421],{},"调用关系","：服务之间的依赖关系（通常由 APM 提供）",[417,424,425,428],{},[293,426,427],{},"部署关系","：服务运行在哪些主机/容器上（可来自 APM、Zabbix 或 Kubernetes）",[308,430,431],{},"有了调用关系，AI 才能判断故障是从上游传导而来，还是当前服务自身问题；有了部署关系，AI 才能关联基础设施层面的异常（例如主机 CPU 飙升、磁盘写满）。",[334,433,434],{"id":434},"实践建议",[414,436,437,443,449],{},[417,438,439,442],{},[293,440,441],{},"优先接入 APM","：APM 通常可同时提供 Traces、调用关系和部署关系，是性价比最高的数据源",[417,444,445,448],{},[293,446,447],{},"补齐基础设施监控","：来自 Zabbix、Node Exporter 等的主机级指标是关键补充",[417,450,451,454],{},[293,452,453],{},"Kubernetes 元数据","：如果使用 K8s，其 Events、Pod 状态、Deployment 变更记录都属于关键上下文",[287,456,459],{"color":457,"icon":458},"primary","i-lucide-trophy","数据越完整，AI 分析越准确。缺少任何一种数据类型都会显著降低排障效率。",[305,461],{},[319,463,465],{"id":464},"_2-假设驱动","2. 假设驱动",[334,467,469],{"id":468},"核心思想像人类-sre-一样思考","核心思想：像人类 SRE 一样思考",[308,471,472],{},"传统 AI 分析方法通常是先收集大量遥测数据，再让模型一次性总结。这种“摘要引擎”模式有明显局限：数据量越大，模型越容易被无关信号干扰，输出质量反而下降。",[308,474,475,476,412],{},"更高效的方式是让 AI ",[293,477,478],{},"像人类 SRE 一样工作",[480,481,482,488,494,500],"ol",{},[417,483,484,487],{},[293,485,486],{},"提出假设","：根据告警和初步数据生成可能根因假设",[417,489,490,493],{},[293,491,492],{},"验证假设","：针对每条假设查询特定遥测数据进行验证",[417,495,496,499],{},[293,497,498],{},"递归下钻","：当某条假设被验证后，继续生成更深一层子假设",[417,501,502,505],{},[293,503,504],{},"及时剪枝","：当某条假设被证伪时，立刻剪掉该分支，聚焦其他方向",[334,507,508],{"id":508},"假设分支策略",[312,510],{":config":314,"code":511},"flowchart%20TD%0A%20%20%20%20A%5B%E5%91%8A%E8%AD%A6%EF%BC%9AAPI%20P95%20%E5%BB%B6%E8%BF%9F%E7%AA%81%E5%A2%9E%5D%20--%3E%20B%5B%E6%8F%90%E5%87%BA%E5%88%9D%E5%A7%8B%E5%81%87%E8%AE%BE%5D%0A%0A%20%20%20%20B%20--%3E%20C%7B%E5%81%87%E8%AE%BE%201%EF%BC%9A%E6%95%B0%E6%8D%AE%E5%BA%93%E6%80%A7%E8%83%BD%E9%97%AE%E9%A2%98%7D%0A%20%20%20%20B%20--%3E%20D%7B%E5%81%87%E8%AE%BE%202%EF%BC%9A%E7%BD%91%E7%BB%9C%E6%97%B6%E5%BB%B6%7D%0A%20%20%20%20B%20--%3E%20E%7B%E5%81%87%E8%AE%BE%203%EF%BC%9A%E8%B5%84%E6%BA%90%E4%B8%8D%E8%B6%B3%7D%0A%0A%20%20%20%20C%20--%3E%7C%22%E6%9F%A5%E8%AF%A2%20DB%20%E6%8C%87%E6%A0%87%EF%BC%8C%E5%8F%91%E7%8E%B0%E6%85%A2%E6%9F%A5%E8%AF%A2%E5%A2%9E%E5%8A%A0%22%7C%20F%5B%E9%AA%8C%E8%AF%81%E9%80%9A%E8%BF%87%5D%0A%20%20%20%20D%20--%3E%7C%22%E6%A3%80%E6%9F%A5%E7%BD%91%E7%BB%9C%E6%8C%87%E6%A0%87%EF%BC%8C%E6%97%A0%E5%BC%82%E5%B8%B8%22%7C%20G%5B%E6%8E%92%E9%99%A4%5D%0A%20%20%20%20E%20--%3E%7C%22%E6%A3%80%E6%9F%A5%20CPU%2F%E5%86%85%E5%AD%98%EF%BC%8C%E8%B5%84%E6%BA%90%E5%85%85%E8%B6%B3%22%7C%20G%0A%0A%20%20%20%20F%20--%3E%20H%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1%EF%BC%9A%E6%89%A7%E8%A1%8C%E8%AE%A1%E5%88%92%E5%8F%98%E5%8C%96%7D%0A%20%20%20%20F%20--%3E%20I%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.2%EF%BC%9A%E8%BF%9E%E6%8E%A5%E6%B1%A0%E8%80%97%E5%B0%BD%7D%0A%0A%20%20%20%20H%20--%3E%7C%22%E5%AF%B9%E6%AF%94%E6%89%A7%E8%A1%8C%E8%AE%A1%E5%88%92%EF%BC%8C%E5%8F%91%E7%8E%B0%E5%85%A8%E8%A1%A8%E6%89%AB%E6%8F%8F%22%7C%20J%5B%E9%AA%8C%E8%AF%81%E9%80%9A%E8%BF%87%5D%0A%20%20%20%20I%20--%3E%7C%22%E8%BF%9E%E6%8E%A5%E6%95%B0%E6%AD%A3%E5%B8%B8%22%7C%20G%0A%0A%20%20%20%20J%20--%3E%20K%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1.1%EF%BC%9A%E7%B4%A2%E5%BC%95%E5%8F%98%E6%9B%B4%7D%0A%20%20%20%20J%20--%3E%20L%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1.2%EF%BC%9A%E6%95%B0%E6%8D%AE%E9%87%8F%E6%BF%80%E5%A2%9E%7D%0A%0A%20%20%20%20K%20--%3E%7C%22%E6%A3%80%E6%9F%A5%20DDL%20%E8%AE%B0%E5%BD%95%EF%BC%8C%E5%8F%91%E7%8E%B0%E7%B4%A2%E5%BC%95%E8%A2%AB%E5%88%A0%E9%99%A4%22%7C%20M%5B%E6%A0%B9%E5%9B%A0%E7%A1%AE%E8%AE%A4%5D%0A%20%20%20%20L%20--%3E%7C%22%E6%95%B0%E6%8D%AE%E9%87%8F%E7%A8%B3%E5%AE%9A%22%7C%20G",[334,513,514],{"id":514},"与传统方法对比",[338,516,517,530],{},[341,518,519],{},[344,520,521,524,527],{},[347,522,523],{"align":349},"维度",[347,525,526],{"align":349},"传统摘要模式",[347,528,529],{"align":349},"假设驱动模式",[358,531,532,545,558,571],{},[344,533,534,539,542],{},[363,535,536],{"align":349},[293,537,538],{},"数据处理方式",[363,540,541],{"align":349},"一次性收集全部数据",[363,543,544],{"align":349},"按需查询特定数据",[344,546,547,552,555],{},[363,548,549],{"align":349},[293,550,551],{},"噪音干扰",[363,553,554],{"align":349},"易被无关异常带偏",[363,556,557],{"align":349},"聚焦因果链路",[344,559,560,565,568],{},[363,561,562],{"align":349},[293,563,564],{},"调查深度",[363,566,567],{"align":349},"停留在表层症状",[363,569,570],{"align":349},"递归下钻到根因",[344,572,573,578,581],{},[363,574,575],{"align":349},[293,576,577],{},"可解释性",[363,579,580],{"align":349},"结论难以追溯",[363,582,583],{"align":349},"具备完整假设验证链",[287,585,586],{"color":457,"icon":458},"假设驱动调查让 AI 的分析过程透明、可追溯，每个结论都有数据支撑。",[305,588],{},[319,590,592],{"id":591},"_3-人机协作","3. 人机协作",[308,594,595,596,412],{},"假设驱动调查要发挥最大效果，需要人和 AI 双向协作。传统 AI 分析是单向的：AI 给结论，用户接受或拒绝。更高效的模式是",[293,597,598],{},"双向协作",[334,600,602],{"id":601},"ai-的角色","AI 的角色",[414,604,605,611,617],{},[417,606,607,610],{},[293,608,609],{},"掌握通用知识","：理解系统架构、故障模式和排障方法论",[417,612,613,616],{},[293,614,615],{},"快速扫描数据","：在海量 Metrics、Logs、Traces 中快速定位异常",[417,618,619,622],{},[293,620,621],{},"并行验证假设","：同时验证多个可能的根因方向",[334,624,625],{"id":625},"人的角色",[414,627,628,634],{},[417,629,630,633],{},[293,631,632],{},"业务上下文","：告诉 AI 业务逻辑，比如“晚上 9 点 CPU 高是活动流量，属正常现象”",[417,635,636,639],{},[293,637,638],{},"经验输入","：提供历史经验，比如“我们网络一直不稳，上次是 DNS 抖动”",[334,641,642],{"id":642},"典型协作场景",[644,645,650],"pre",{"className":646,"code":648,"language":649},[647],"language-text","AI:   （完成初步假设分析）\n      \"已验证 3 条假设：\n       ✓ 数据库慢查询增加 - 与告警时间匹配\n       ✗ 网络时延 - 无异常\n       ✗ 资源不足 - CPU/内存充足\n\n       正在沿数据库方向继续深挖...\"\n\nUser: （看到假设列表，想起上周变更）\n      \"DBA 上周改过订单表索引\"\n\nAI:   （新增假设并验证）\n      \"新增假设：索引变更导致执行计划变化\n       → 检查 DDL 变更记录：发现上周四删除了订单表索引\n       → 对比执行计划：删除前走索引扫描，删除后变为全表扫描\n       → 时间关联：索引删除时间与慢查询开始时间一致\n       ✓ 假设成立，确认根因\"\n\nUser: （验证成功）\n      \"确认了！需要恢复索引。\"\n","text",[651,652,648],"code",{"__ignoreMap":653},"",[287,655,656],{"color":457,"icon":458},"AI 擅长处理海量数据和通用知识，人擅长提供业务上下文和历史经验。双向协作能让排障效率远超纯 AI 或纯人工。",[305,658],{},[319,660,662],{"id":661},"_4-退出策略","4. 退出策略",[308,664,665],{},"AI 并不总能直接找到根因，尤其在数据接入不完整时。但这并不意味着 AI 分析没有价值。",[334,667,668],{"id":668},"多组件问题的深度排查",[308,670,671],{},"复杂事故的根因可能横跨多个系统，或需要多层推理才能找到。假设驱动允许 AI 递归下钻，直到搜索空间被穷尽。",[308,673,674],{},[293,675,676],{},"案例：Pod 频繁重启（CrashLoopBackOff）",[644,678,681],{"className":679,"code":680,"language":649},[647],"告警：Kubernetes Pod 进入 CrashLoopBackOff 状态\n\n第一层分析：\n  → 假设：内存不足触发 OOM Kill\n  → 验证：检查 Pod 事件，确认 OOMKilled\n  → 结论：假设成立，但这只是表层原因\n\n第二层分析（递归深挖）：\n  → 假设：请求负载异常增大导致内存突增\n  → 验证：检查入流量，发现 Kafka 消息体异常变大\n  → 结论：假设成立，继续下钻\n\n第三层分析：\n  → 假设：上游系统发送了异常大消息\n  → 验证：追查消息来源，发现某批次数据包含损坏的大文件\n  → 结论：根因确认 - 上游数据异常导致消息尺寸溢出\n",[651,682,680],{"__ignoreMap":653},[308,684,685,686,689],{},"早期版本 AI 可能在第一层就停止，只给出“Pod OOM”结论。但这对工程师帮助有限，因为告警本身已经告诉了这件事。真正有价值的是找到",[293,687,688],{},"为什么会 OOM","。",[334,691,692],{"id":692},"排除干扰项的价值",[308,694,695],{},"即使 AI 缺少足够数据、无法直接锁定根因，它通常仍能：",[480,697,698,704],{},[417,699,700,703],{},[293,701,702],{},"指出大致排查方向","：比如“问题更可能在数据库层”或“与最近部署变更相关”",[417,705,706,709],{},[293,707,708],{},"排除无关干扰项","：比如确认网络连通正常、资源利用率充足、缓存命中率无异常",[308,711,712],{},"这种“排除法”本身就能节省大量时间。传统排障里，工程师往往要先逐项检查网络、资源、缓存等基础设施，再排除这些可能性。AI 能在几分钟内完成这一步，让用户直接聚焦最可能的问题方向。",[334,714,715],{"id":715},"上下文交接",[308,717,718],{},"当 AI 因数据不足无法继续深挖时，它可以向用户输出结构化上下文交接：",[644,720,723],{"className":721,"code":722,"language":649},[647],"📋 排查进度交接\n\n⏱️ 分析耗时：5 分钟 | 已扫描组件：12 个\n\n✅ 已排除：\n• 网络连通正常（Ping \u003C1ms，无丢包）\n• K8s 资源充足（CPU \u003C60%，Memory \u003C70%）\n• 缓存命中率正常（Redis 99.2%）\n\n🎯 重点方向：\n• 问题集中在 order-service → mysql-cluster 链路\n• 数据库性能相关问题概率较高\n\n⚠️ 需人工确认（缺失数据源）：\n• 数据库慢查询日志（未接入）\n• 最近 Schema 变更记录（未接入）\n",[651,724,722],{"__ignoreMap":653},[287,726,727],{"color":457,"icon":458},"前期扫描结果不会浪费。即使 AI 不能给出最终答案，用户也能从更小的排查范围起步，而不是从零开始。",[305,729],{},[319,731,733],{"id":732},"_5-知识沉淀","5. 知识沉淀",[308,735,736],{},[326,737],{"alt":738,"src":739},"Knowledge Accumulation and Experience Reuse","/images/blog/ai-troubleshooting/knowledge-accumulation.png",[308,741,742],{},"如果没有 SOP 或 Runbook，AI 首次遇到某类问题时可能需要大量探索。但这些探索结果不应被浪费。",[334,744,745],{"id":745},"因果验证的复杂性",[308,747,748,749,752],{},"假设驱动调查的核心是",[293,750,751],{},"验证因果关系","，即判断某个异常是否真的导致了当前告警。但因果验证远比看起来复杂：",[338,754,755,768],{},[341,756,757],{},[344,758,759,762,765],{},[347,760,761],{"align":349},"验证维度",[347,763,764],{"align":349},"说明",[347,766,767],{"align":349},"挑战",[358,769,770,783,796,809],{},[344,771,772,777,780],{},[363,773,774],{"align":349},[293,775,776],{},"时间相关性",[363,778,779],{"align":349},"异常出现时间是否与告警时间匹配",[363,781,782],{"align":349},"分布式系统中可能存在时间戳偏移",[344,784,785,790,793],{},[363,786,787],{"align":349},[293,788,789],{},"传播路径",[363,791,792],{"align":349},"异常是否位于告警对象的上下游链路",[363,794,795],{"align":349},"需要完整调用拓扑图",[344,797,798,803,806],{},[363,799,800],{"align":349},[293,801,802],{},"影响范围",[363,804,805],{"align":349},"异常影响的资源是否与告警相关",[363,807,808],{"align":349},"需要理解资源之间依赖关系",[344,810,811,816,819],{},[363,812,813],{"align":349},[293,814,815],{},"业务语义",[363,817,818],{"align":349},"该异常在业务层面是否合理",[363,820,821],{"align":349},"需要深度理解业务逻辑",[308,823,824,825,828],{},"其中最后一项“业务语义”尤其依赖",[293,826,827],{},"对客户业务的深度理解","。例如：",[414,830,831,834],{},[417,832,833],{},"订单服务延迟升高，AI 发现数据库有慢查询。但这个慢查询究竟是“每天零点跑、与线上业务无关的报表任务”，还是核心下单查询？只有懂业务的人才能判断。",[417,835,836],{},"某服务错误率上升，AI 发现最近有代码部署。但这个部署是“新功能灰度（预期会有少量错误）”，还是意外 bug？这需要结合发布计划判断。",[308,838,839],{},"这类业务知识无法直接从遥测数据里获得，必须通过知识沉淀逐步积累。",[334,841,842],{"id":842},"从排障过程沉淀知识",[308,844,845],{},"当一次事故调查结束后，AI 可将过程总结为知识条目：",[414,847,848,854,860],{},[417,849,850,853],{},[293,851,852],{},"问题特征","：本次调查由哪些告警/症状组合触发",[417,855,856,859],{},[293,857,858],{},"调查路径","：尝试了哪些方向，最终定位到什么根因",[417,861,862,865],{},[293,863,864],{},"解决方案","：如何修复、需要注意什么",[334,867,868],{"id":868},"绑定到特定告警与资源",[308,870,871],{},"这些知识可绑定到具体告警类型或资源。当下次遇到相似问题时：",[480,873,874,877,880],{},[417,875,876],{},"AI 自动召回相关知识",[417,878,879],{},"参考上次调查路径，快速确认是否同类问题",[417,881,882],{},"如果症状一致，直接给出修复建议；即便不一致，也能先排除该方向",[334,884,885],{"id":885},"示例场景",[644,887,890],{"className":888,"code":889,"language":649},[647],"第一次：\n• 告警：order-service P95 延迟上升\n• 调查过程：查网络 → 查资源 → 查数据库 → 发现索引问题\n• 沉淀知识：绑定到 order-service + 延迟类告警\n\n第二次：\n• 同类告警再次触发\n• AI 自动关联知识：\"上次相似问题由索引导致，是否优先检查数据库？\"\n• 用户确认后，直接跳过网络和资源排查，进入数据库检查\n• 调查耗时从 30 分钟降到 5 分钟\n",[651,891,889],{"__ignoreMap":653},[287,893,894],{"color":457,"icon":458},"因果验证的准确性取决于对业务的深度理解。通过知识沉淀，团队经验不再只存在于个人脑中，而是成为 AI 做因果判断的重要依据。",[305,896],{},[319,898,900],{"id":899},"_6-总结","6. 总结",[338,902,903,912],{},[341,904,905],{},[344,906,907,910],{},[347,908,909],{"align":349},"能力",[347,911,764],{"align":349},[358,913,914,924,934,943,953],{},[344,915,916,921],{},[363,917,918],{"align":349},[293,919,920],{},"可观测性上下文",[363,922,923],{"align":349},"集成 Metrics、Logs、Traces 与调用拓扑",[344,925,926,931],{},[363,927,928],{"align":349},[293,929,930],{},"假设驱动",[363,932,933],{"align":349},"提假设 → 验证 → 递归下钻，而非简单摘要",[344,935,936,940],{},[363,937,938],{"align":349},[293,939,299],{},[363,941,942],{"align":349},"AI 扫描数据，人类提供业务上下文和历史经验",[344,944,945,950],{},[363,946,947],{"align":349},[293,948,949],{},"退出策略",[363,951,952],{"align":349},"即便无法锁定根因，也能排除干扰项并输出关键结论",[344,954,955,960],{},[363,956,957],{"align":349},[293,958,959],{},"知识沉淀",[363,961,962],{"align":349},"沉淀业务知识，提升后续排障准确性与效率",[308,964,965],{},"Castrel 事故排障 Agent 的目标不是“AI 取代人”，而是让人机协作效率显著超过纯 AI 或纯人工。",{"title":653,"searchDepth":967,"depth":967,"links":968},2,[969,975,980,985,990,996],{"id":321,"depth":967,"text":322,"children":970},[971,973,974],{"id":336,"depth":972,"text":336},3,{"id":405,"depth":972,"text":405},{"id":434,"depth":972,"text":434},{"id":464,"depth":967,"text":465,"children":976},[977,978,979],{"id":468,"depth":972,"text":469},{"id":508,"depth":972,"text":508},{"id":514,"depth":972,"text":514},{"id":591,"depth":967,"text":592,"children":981},[982,983,984],{"id":601,"depth":972,"text":602},{"id":625,"depth":972,"text":625},{"id":642,"depth":972,"text":642},{"id":661,"depth":967,"text":662,"children":986},[987,988,989],{"id":668,"depth":972,"text":668},{"id":692,"depth":972,"text":692},{"id":715,"depth":972,"text":715},{"id":732,"depth":967,"text":733,"children":991},[992,993,994,995],{"id":745,"depth":972,"text":745},{"id":842,"depth":972,"text":842},{"id":868,"depth":972,"text":868},{"id":885,"depth":972,"text":885},{"id":899,"depth":967,"text":900},"本文介绍 Castrel 事故排障 Agent 的核心设计理念，包括假设驱动调查、人机协作和业务知识沉淀。","md",{"date":1000,"order":1001,"category":1002,"image":1003},"2026-01-19",1,"产品",{"src":1004},"/images/blog/ai-troubleshooting/incident-investigation-header.png","/zh/blogs/how-castrel-builds-an-incident-troubleshooting-agent",{"ogImage":1007,"title":282,"description":997},"/images/blog/ai-troubleshooting/og_image.png","zh/blogs/1.how-castrel-builds-an-incident-troubleshooting-agent","oA54gBLONysKNIo71wVUq01jbYBLqp4phDUrB4elDM0",1777027426154]