[{"data":1,"prerenderedAt":1011},["ShallowReactive",2],{"navigation_docs_zh":3,"blogs_zh":280},[4,18,51,254,267],{"title":5,"icon":6,"path":7,"stem":8,"children":9,"page":6},"入门",false,"/zh/docs/getting-started","zh/docs/1.getting-started",[10,14],{"title":11,"path":12,"stem":13},"简介","/zh/docs/getting-started/introduction","zh/docs/1.getting-started/1.introduction",{"title":15,"path":16,"stem":17},"快速开始","/zh/docs/getting-started/quick-start","zh/docs/1.getting-started/2.quick-start",{"title":19,"icon":6,"path":20,"stem":21,"children":22,"page":6},"功能","/zh/docs/features","zh/docs/2.features",[23,27,31,35,39,43,47],{"title":24,"path":25,"stem":26},"告警分诊","/zh/docs/features/alert-triage","zh/docs/2.features/2.alert-triage",{"title":28,"path":29,"stem":30},"事故调查","/zh/docs/features/incident-investigation","zh/docs/2.features/3.incident-investigation",{"title":32,"path":33,"stem":34},"部署验证","/zh/docs/features/deployment-verification","zh/docs/2.features/4.deployment-verification",{"title":36,"path":37,"stem":38},"数据探索","/zh/docs/features/data-exploration","zh/docs/2.features/5.data-exploration",{"title":40,"path":41,"stem":42},"知识库","/zh/docs/features/knowledges","zh/docs/2.features/6.knowledges",{"title":44,"path":45,"stem":46},"Castrel Proxy","/zh/docs/features/castrel-proxy","zh/docs/2.features/7.castrel-proxy",{"title":48,"path":49,"stem":50},"自动化","/zh/docs/features/automations","zh/docs/2.features/8.automations",{"title":52,"icon":6,"path":53,"stem":54,"children":55},"集成","/zh/docs/integrations","zh/docs/3.integrations/index",[56,57,62,67,72,77,81,85,89,94,99,104,109,113,117,122,127,131,136,141,146,151,156,160,165,170,174,178,183,188,193,198,203,208,212,216,220,224,229,234,239,244,249],{"title":52,"path":53,"stem":54},{"title":58,"path":59,"stem":60,"icon":61},"Prometheus","/zh/docs/integrations/prometheus","zh/docs/3.integrations/1.prometheus","i-simple-icons-prometheus",{"title":63,"path":64,"stem":65,"icon":66},"AWS","/zh/docs/integrations/aws","zh/docs/3.integrations/10.aws","i-simple-icons-amazonwebservices",{"title":68,"path":69,"stem":70,"icon":71},"阿里云（Aliyun）","/zh/docs/integrations/aliyun","zh/docs/3.integrations/12.aliyun","i-simple-icons-alibabacloud",{"title":73,"path":74,"stem":75,"icon":76},"腾讯云（Tencent Cloud）","/zh/docs/integrations/tencent-cloud","zh/docs/3.integrations/13.tencent-cloud","i-lucide-plug",{"title":78,"path":79,"stem":80,"icon":76},"华为云（Huawei Cloud）","/zh/docs/integrations/huaweicloud","zh/docs/3.integrations/14.huaweicloud",{"title":82,"path":83,"stem":84,"icon":76},"火山引擎（Volcengine）","/zh/docs/integrations/volcengine","zh/docs/3.integrations/15.volcengine",{"title":86,"path":87,"stem":88,"icon":76},"轻帆云（QingFanYun）","/zh/docs/integrations/qingfanyun","zh/docs/3.integrations/16.qingfanyun",{"title":90,"path":91,"stem":92,"icon":93},"Grafana","/zh/docs/integrations/grafana","zh/docs/3.integrations/17.grafana","i-simple-icons-grafana",{"title":95,"path":96,"stem":97,"icon":98},"VictoriaMetrics","/zh/docs/integrations/victoriametrics","zh/docs/3.integrations/18.victoriametrics","i-simple-icons-victoriametrics",{"title":100,"path":101,"stem":102,"icon":103},"New Relic","/zh/docs/integrations/new-relic","zh/docs/3.integrations/19.new-relic","i-simple-icons-newrelic",{"title":105,"path":106,"stem":107,"icon":108},"Elasticsearch","/zh/docs/integrations/elasticsearch","zh/docs/3.integrations/2.elasticsearch","i-simple-icons-elasticsearch",{"title":110,"path":111,"stem":112,"icon":76},"Zabbix","/zh/docs/integrations/zabbix","zh/docs/3.integrations/20.zabbix",{"title":114,"path":115,"stem":116,"icon":76},"监控宝（JianKongBao）","/zh/docs/integrations/jiankongbao","zh/docs/3.integrations/21.jiankongbao",{"title":118,"path":119,"stem":120,"icon":121},"PagerDuty","/zh/docs/integrations/pagerduty","zh/docs/3.integrations/22.pagerduty","i-simple-icons-pagerduty",{"title":123,"path":124,"stem":125,"icon":126},"Sentry","/zh/docs/integrations/sentry","zh/docs/3.integrations/23.sentry","i-simple-icons-sentry",{"title":128,"path":129,"stem":130,"icon":76},"Freshworks / Freshservice","/zh/docs/integrations/freshworks","zh/docs/3.integrations/24.freshworks",{"title":132,"path":133,"stem":134,"icon":135},"Linear","/zh/docs/integrations/linear","zh/docs/3.integrations/25.linear","i-simple-icons-linear",{"title":137,"path":138,"stem":139,"icon":140},"ClickHouse","/zh/docs/integrations/clickhouse","zh/docs/3.integrations/26.clickhouse","i-simple-icons-clickhouse",{"title":142,"path":143,"stem":144,"icon":145},"Kubernetes","/zh/docs/integrations/kubernetes","zh/docs/3.integrations/27.kubernetes","i-simple-icons-kubernetes",{"title":147,"path":148,"stem":149,"icon":150},"Terraform Cloud / HCP Terraform","/zh/docs/integrations/terraform","zh/docs/3.integrations/28.terraform","i-simple-icons-terraform",{"title":152,"path":153,"stem":154,"icon":155},"Jenkins","/zh/docs/integrations/jenkins","zh/docs/3.integrations/29.jenkins","i-simple-icons-jenkins",{"title":157,"path":158,"stem":159,"icon":93},"Grafana Loki","/zh/docs/integrations/grafana-loki","zh/docs/3.integrations/3.grafana-loki",{"title":161,"path":162,"stem":163,"icon":164},"Ansible / AWX","/zh/docs/integrations/ansible","zh/docs/3.integrations/30.ansible","i-simple-icons-ansible",{"title":166,"path":167,"stem":168,"icon":169},"GitLab","/zh/docs/integrations/gitlab","zh/docs/3.integrations/31.gitlab","i-simple-icons-gitlab",{"title":171,"path":172,"stem":173,"icon":76},"钉钉（DingTalk）","/zh/docs/integrations/dingtalk","zh/docs/3.integrations/32.dingtalk",{"title":175,"path":176,"stem":177,"icon":76},"飞书（Feishu / Lark）","/zh/docs/integrations/feishu","zh/docs/3.integrations/33.feishu",{"title":179,"path":180,"stem":181,"icon":182},"Telegram","/zh/docs/integrations/telegram","zh/docs/3.integrations/34.telegram","i-simple-icons-telegram",{"title":184,"path":185,"stem":186,"icon":187},"Email","/zh/docs/integrations/email","zh/docs/3.integrations/35.email","i-simple-icons-gmail",{"title":189,"path":190,"stem":191,"icon":192},"微信企业机器人（Weixin Clawbot）","/zh/docs/integrations/weixin-clawbot","zh/docs/3.integrations/36.weixin-clawbot","i-simple-icons-wechat",{"title":194,"path":195,"stem":196,"icon":197},"Notion","/zh/docs/integrations/notion","zh/docs/3.integrations/37.notion","i-simple-icons-notion",{"title":199,"path":200,"stem":201,"icon":202},"Confluence","/zh/docs/integrations/confluence","zh/docs/3.integrations/38.confluence","i-simple-icons-confluence",{"title":204,"path":205,"stem":206,"icon":207},"Google Docs","/zh/docs/integrations/google-docs","zh/docs/3.integrations/39.google-docs","i-simple-icons-googledocs",{"title":209,"path":210,"stem":211,"icon":93},"Grafana Tempo","/zh/docs/integrations/grafana-tempo","zh/docs/3.integrations/4.grafana-tempo",{"title":213,"path":214,"stem":215,"icon":76},"钉钉文档（DingTalk Docs）","/zh/docs/integrations/dingtalk-docs","zh/docs/3.integrations/40.dingtalk-docs",{"title":217,"path":218,"stem":219,"icon":76},"LDAP","/zh/docs/integrations/ldap","zh/docs/3.integrations/41.ldap",{"title":221,"path":222,"stem":223,"icon":76},"Dify","/zh/docs/integrations/dify","zh/docs/3.integrations/42.dify",{"title":225,"path":226,"stem":227,"icon":228},"自定义 MCP（Custom MCP）","/zh/docs/integrations/custom-mcp","zh/docs/3.integrations/43.custom-mcp","i-simple-icons-anthropic",{"title":230,"path":231,"stem":232,"icon":233},"GitHub","/zh/docs/integrations/github","zh/docs/3.integrations/5.github","i-simple-icons-github",{"title":235,"path":236,"stem":237,"icon":238},"Slack","/zh/docs/integrations/slack","zh/docs/3.integrations/6.slack","i-simple-icons-slack",{"title":240,"path":241,"stem":242,"icon":243},"Vercel","/zh/docs/integrations/vercel","zh/docs/3.integrations/7.vercel","i-simple-icons-vercel",{"title":245,"path":246,"stem":247,"icon":248},"Graylog","/zh/docs/integrations/graylog","zh/docs/3.integrations/8.graylog","i-simple-icons-graylog",{"title":250,"path":251,"stem":252,"icon":253},"Datadog","/zh/docs/integrations/datadog","zh/docs/3.integrations/9.datadog","i-simple-icons-datadog",{"title":255,"path":256,"stem":257,"children":258,"page":6},"更多","/zh/docs/more","zh/docs/4.more",[259,263],{"title":260,"path":261,"stem":262},"路线图","/zh/docs/more/roadmap","zh/docs/4.more/1.roadmap",{"title":264,"path":265,"stem":266},"支持","/zh/docs/more/support","zh/docs/4.more/2.support",{"title":268,"path":269,"stem":270,"children":271,"page":6},"安全","/zh/docs/security","zh/docs/5.security",[272,276],{"title":273,"path":274,"stem":275},"隐私政策","/zh/docs/security/privacy-policy","zh/docs/5.security/1.privacy-policy",{"title":277,"path":278,"stem":279},"服务条款","/zh/docs/security/terms-of-service","zh/docs/5.security/2.terms-of-service",[281],{"id":282,"title":283,"body":284,"description":998,"extension":999,"meta":1000,"navigation":6,"path":1006,"seo":1007,"stem":1009,"__hash__":1010},"blogs_zh/zh/blogs/1.how-castrel-builds-an-incident-troubleshooting-agent.md","Castrel 如何构建事故排障 Agent",{"type":285,"value":286,"toc":967},"minimark",[287,305,308,312,317,319,324,331,334,338,401,404,407,414,430,433,436,456,461,463,467,471,474,480,507,510,513,516,585,588,590,594,600,604,624,627,641,644,655,658,660,664,667,670,673,678,684,691,694,697,711,714,717,720,726,729,731,735,741,744,747,754,823,830,838,841,844,847,867,870,873,884,887,893,896,898,902,964],[288,289,292,293,297,298,297,301,304],"callout",{"color":290,"icon":291},"info","i-lucide-info","本文介绍 Castrel 事故排障 Agent 的核心设计理念，包含：",[294,295,296],"strong",{},"假设驱动调查","、",[294,299,300],{},"人机协作",[294,302,303],{},"业务知识沉淀","，帮助团队从“发现问题”到“定位根因或快速升级”形成高效闭环。",[306,307],"hr",{},[309,310,311],"p",{},"下图展示了 Castrel 事故排障 Agent 的核心工作流。",[313,314],"mermaid",{":config":315,"code":316},"config","%25%25%7Binit%3A%20%7B'flowchart'%3A%20%7B'subGraphTitleMargin'%3A%20%7B'top'%3A%200%2C%20'bottom'%3A%200%7D%7D%7D%7D%25%25%0Agraph%20TB%0A%20%20%20%20subgraph%20main%5B%22%20%22%5D%0A%20%20%20%20%20%20%20%20direction%20LR%0A%20%20%20%20%20%20%20%20A%5B%E7%97%87%E7%8A%B6%5D%0A%0A%20%20%20%20%20%20%20%20subgraph%20Y%5B%22%E4%BA%8B%E6%95%85%E5%88%86%E6%9E%90%22%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20direction%20LR%0A%20%20%20%20%20%20%20%20%20%20%20%20B%5B%E7%94%9F%E6%88%90%E5%81%87%E8%AE%BE%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20C%5B%E6%94%B6%E9%9B%86%E4%B8%8A%E4%B8%8B%E6%96%87%5D%0A%20%20%20%20%20%20%20%20%20%20%20%20D%5B%E9%AA%8C%E8%AF%81%E5%81%87%E8%AE%BE%5D%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20B%20--%3E%20C%0A%20%20%20%20%20%20%20%20%20%20%20%20C%20--%3E%20D%0A%20%20%20%20%20%20%20%20%20%20%20%20D%20--%3E%20B%0A%20%20%20%20%20%20%20%20end%0A%0A%20%20%20%20%20%20%20%20F%5B%E6%8E%92%E9%9A%9C%E6%8A%A5%E5%91%8A%5D%0A%0A%20%20%20%20%20%20%20%20A%20--%3E%20%7C%E8%A7%A6%E5%8F%91%7C%20Y%0A%20%20%20%20%20%20%20%20Y%20--%3E%20%7C%E9%80%80%E5%87%BA%7C%20F%0A%20%20%20%20end%0A%0A%20%20%20%20Z%5B%E4%BA%BA%E7%B1%BB%5D%0A%0A%20%20%20%20subgraph%20X%5B%22%E4%BA%8B%E6%95%85%E7%9F%A5%E8%AF%86%22%5D%0A%20%20%20%20%20%20%20%20X1%5B%22%E5%8E%86%E5%8F%B2%E4%BA%8B%E6%95%85%22%5D%0A%20%20%20%20%20%20%20%20X2%5B%22Runbook%22%5D%0A%20%20%20%20end%0A%0A%20%20%20%20X%20--%3E%20%7C%E5%8F%AC%E5%9B%9E%7C%20main%0A%20%20%20%20main%20--%3E%20%7C%E6%B2%89%E6%B7%80%7C%20X%0A%20%20%20%20Z%20--%3E%20%7C%E5%8F%8D%E9%A6%88%7C%20main%0A%20%20%20%20main%20--%3E%20%7C%E5%B1%95%E7%A4%BA%E7%BB%93%E6%9E%9C%7C%20Z",[306,318],{},[320,321,323],"h2",{"id":322},"_1-可观测性上下文","1. 可观测性上下文",[309,325,326],{},[327,328],"img",{"alt":329,"src":330},"AI Analysis and Observability Data","/images/blog/ai-troubleshooting/observability.png",[309,332,333],{},"AI 排障效果很大程度取决于它能够访问到的上下文数据。完整的可观测性上下文应包含以下维度。",[335,336,337],"h3",{"id":337},"三类核心可观测性数据",[339,340,341,358],"table",{},[342,343,344],"thead",{},[345,346,347,352,355],"tr",{},[348,349,351],"th",{"align":350},"left","数据类型",[348,353,354],{"align":350},"作用",[348,356,357],{"align":350},"常见来源",[359,360,361,375,388],"tbody",{},[345,362,363,369,372],{},[364,365,366],"td",{"align":350},[294,367,368],{},"Metrics",[364,370,371],{"align":350},"发现异常、量化问题严重程度",[364,373,374],{"align":350},"Prometheus、Zabbix、CloudWatch",[345,376,377,382,385],{},[364,378,379],{"align":350},[294,380,381],{},"Logs",[364,383,384],{"align":350},"定位具体错误、获得上下文细节",[364,386,387],{"align":350},"Elasticsearch、Loki、Splunk",[345,389,390,395,398],{},[364,391,392],{"align":350},[294,393,394],{},"Traces",[364,396,397],{"align":350},"追踪请求路径、定位慢调用位置",[364,399,400],{"align":350},"Jaeger、Tempo、SkyWalking",[309,402,403],{},"仅依赖任意单一数据类型都很难高效排障。Metrics 告诉你“出了问题”，Logs 告诉你“具体是什么错”，Traces 告诉你“链路上哪里出了问题”。",[335,405,406],{"id":406},"调用关系与部署关系",[309,408,409,410,413],{},"除了三类可观测性数据，AI 还需要理解系统的",[294,411,412],{},"拓扑关系","：",[415,416,417,424],"ul",{},[418,419,420,423],"li",{},[294,421,422],{},"调用关系","：服务之间的依赖关系（通常由 APM 提供）",[418,425,426,429],{},[294,427,428],{},"部署关系","：服务运行在哪些主机/容器上（可来自 APM、Zabbix 或 Kubernetes）",[309,431,432],{},"有了调用关系，AI 才能判断故障是从上游传导而来，还是当前服务自身问题；有了部署关系，AI 才能关联基础设施层面的异常（例如主机 CPU 飙升、磁盘写满）。",[335,434,435],{"id":435},"实践建议",[415,437,438,444,450],{},[418,439,440,443],{},[294,441,442],{},"优先接入 APM","：APM 通常可同时提供 Traces、调用关系和部署关系，是性价比最高的数据源",[418,445,446,449],{},[294,447,448],{},"补齐基础设施监控","：来自 Zabbix、Node Exporter 等的主机级指标是关键补充",[418,451,452,455],{},[294,453,454],{},"Kubernetes 元数据","：如果使用 K8s，其 Events、Pod 状态、Deployment 变更记录都属于关键上下文",[288,457,460],{"color":458,"icon":459},"primary","i-lucide-trophy","数据越完整，AI 分析越准确。缺少任何一种数据类型都会显著降低排障效率。",[306,462],{},[320,464,466],{"id":465},"_2-假设驱动","2. 假设驱动",[335,468,470],{"id":469},"核心思想像人类-sre-一样思考","核心思想：像人类 SRE 一样思考",[309,472,473],{},"传统 AI 分析方法通常是先收集大量遥测数据，再让模型一次性总结。这种“摘要引擎”模式有明显局限：数据量越大，模型越容易被无关信号干扰，输出质量反而下降。",[309,475,476,477,413],{},"更高效的方式是让 AI ",[294,478,479],{},"像人类 SRE 一样工作",[481,482,483,489,495,501],"ol",{},[418,484,485,488],{},[294,486,487],{},"提出假设","：根据告警和初步数据生成可能根因假设",[418,490,491,494],{},[294,492,493],{},"验证假设","：针对每条假设查询特定遥测数据进行验证",[418,496,497,500],{},[294,498,499],{},"递归下钻","：当某条假设被验证后，继续生成更深一层子假设",[418,502,503,506],{},[294,504,505],{},"及时剪枝","：当某条假设被证伪时，立刻剪掉该分支，聚焦其他方向",[335,508,509],{"id":509},"假设分支策略",[313,511],{":config":315,"code":512},"flowchart%20TD%0A%20%20%20%20A%5B%E5%91%8A%E8%AD%A6%EF%BC%9AAPI%20P95%20%E5%BB%B6%E8%BF%9F%E7%AA%81%E5%A2%9E%5D%20--%3E%20B%5B%E6%8F%90%E5%87%BA%E5%88%9D%E5%A7%8B%E5%81%87%E8%AE%BE%5D%0A%0A%20%20%20%20B%20--%3E%20C%7B%E5%81%87%E8%AE%BE%201%EF%BC%9A%E6%95%B0%E6%8D%AE%E5%BA%93%E6%80%A7%E8%83%BD%E9%97%AE%E9%A2%98%7D%0A%20%20%20%20B%20--%3E%20D%7B%E5%81%87%E8%AE%BE%202%EF%BC%9A%E7%BD%91%E7%BB%9C%E6%97%B6%E5%BB%B6%7D%0A%20%20%20%20B%20--%3E%20E%7B%E5%81%87%E8%AE%BE%203%EF%BC%9A%E8%B5%84%E6%BA%90%E4%B8%8D%E8%B6%B3%7D%0A%0A%20%20%20%20C%20--%3E%7C%22%E6%9F%A5%E8%AF%A2%20DB%20%E6%8C%87%E6%A0%87%EF%BC%8C%E5%8F%91%E7%8E%B0%E6%85%A2%E6%9F%A5%E8%AF%A2%E5%A2%9E%E5%8A%A0%22%7C%20F%5B%E9%AA%8C%E8%AF%81%E9%80%9A%E8%BF%87%5D%0A%20%20%20%20D%20--%3E%7C%22%E6%A3%80%E6%9F%A5%E7%BD%91%E7%BB%9C%E6%8C%87%E6%A0%87%EF%BC%8C%E6%97%A0%E5%BC%82%E5%B8%B8%22%7C%20G%5B%E6%8E%92%E9%99%A4%5D%0A%20%20%20%20E%20--%3E%7C%22%E6%A3%80%E6%9F%A5%20CPU%2F%E5%86%85%E5%AD%98%EF%BC%8C%E8%B5%84%E6%BA%90%E5%85%85%E8%B6%B3%22%7C%20G%0A%0A%20%20%20%20F%20--%3E%20H%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1%EF%BC%9A%E6%89%A7%E8%A1%8C%E8%AE%A1%E5%88%92%E5%8F%98%E5%8C%96%7D%0A%20%20%20%20F%20--%3E%20I%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.2%EF%BC%9A%E8%BF%9E%E6%8E%A5%E6%B1%A0%E8%80%97%E5%B0%BD%7D%0A%0A%20%20%20%20H%20--%3E%7C%22%E5%AF%B9%E6%AF%94%E6%89%A7%E8%A1%8C%E8%AE%A1%E5%88%92%EF%BC%8C%E5%8F%91%E7%8E%B0%E5%85%A8%E8%A1%A8%E6%89%AB%E6%8F%8F%22%7C%20J%5B%E9%AA%8C%E8%AF%81%E9%80%9A%E8%BF%87%5D%0A%20%20%20%20I%20--%3E%7C%22%E8%BF%9E%E6%8E%A5%E6%95%B0%E6%AD%A3%E5%B8%B8%22%7C%20G%0A%0A%20%20%20%20J%20--%3E%20K%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1.1%EF%BC%9A%E7%B4%A2%E5%BC%95%E5%8F%98%E6%9B%B4%7D%0A%20%20%20%20J%20--%3E%20L%7B%E5%AD%90%E5%81%87%E8%AE%BE%201.1.2%EF%BC%9A%E6%95%B0%E6%8D%AE%E9%87%8F%E6%BF%80%E5%A2%9E%7D%0A%0A%20%20%20%20K%20--%3E%7C%22%E6%A3%80%E6%9F%A5%20DDL%20%E8%AE%B0%E5%BD%95%EF%BC%8C%E5%8F%91%E7%8E%B0%E7%B4%A2%E5%BC%95%E8%A2%AB%E5%88%A0%E9%99%A4%22%7C%20M%5B%E6%A0%B9%E5%9B%A0%E7%A1%AE%E8%AE%A4%5D%0A%20%20%20%20L%20--%3E%7C%22%E6%95%B0%E6%8D%AE%E9%87%8F%E7%A8%B3%E5%AE%9A%22%7C%20G",[335,514,515],{"id":515},"与传统方法对比",[339,517,518,531],{},[342,519,520],{},[345,521,522,525,528],{},[348,523,524],{"align":350},"维度",[348,526,527],{"align":350},"传统摘要模式",[348,529,530],{"align":350},"假设驱动模式",[359,532,533,546,559,572],{},[345,534,535,540,543],{},[364,536,537],{"align":350},[294,538,539],{},"数据处理方式",[364,541,542],{"align":350},"一次性收集全部数据",[364,544,545],{"align":350},"按需查询特定数据",[345,547,548,553,556],{},[364,549,550],{"align":350},[294,551,552],{},"噪音干扰",[364,554,555],{"align":350},"易被无关异常带偏",[364,557,558],{"align":350},"聚焦因果链路",[345,560,561,566,569],{},[364,562,563],{"align":350},[294,564,565],{},"调查深度",[364,567,568],{"align":350},"停留在表层症状",[364,570,571],{"align":350},"递归下钻到根因",[345,573,574,579,582],{},[364,575,576],{"align":350},[294,577,578],{},"可解释性",[364,580,581],{"align":350},"结论难以追溯",[364,583,584],{"align":350},"具备完整假设验证链",[288,586,587],{"color":458,"icon":459},"假设驱动调查让 AI 的分析过程透明、可追溯，每个结论都有数据支撑。",[306,589],{},[320,591,593],{"id":592},"_3-人机协作","3. 人机协作",[309,595,596,597,413],{},"假设驱动调查要发挥最大效果，需要人和 AI 双向协作。传统 AI 分析是单向的：AI 给结论，用户接受或拒绝。更高效的模式是",[294,598,599],{},"双向协作",[335,601,603],{"id":602},"ai-的角色","AI 的角色",[415,605,606,612,618],{},[418,607,608,611],{},[294,609,610],{},"掌握通用知识","：理解系统架构、故障模式和排障方法论",[418,613,614,617],{},[294,615,616],{},"快速扫描数据","：在海量 Metrics、Logs、Traces 中快速定位异常",[418,619,620,623],{},[294,621,622],{},"并行验证假设","：同时验证多个可能的根因方向",[335,625,626],{"id":626},"人的角色",[415,628,629,635],{},[418,630,631,634],{},[294,632,633],{},"业务上下文","：告诉 AI 业务逻辑，比如“晚上 9 点 CPU 高是活动流量，属正常现象”",[418,636,637,640],{},[294,638,639],{},"经验输入","：提供历史经验，比如“我们网络一直不稳，上次是 DNS 抖动”",[335,642,643],{"id":643},"典型协作场景",[645,646,651],"pre",{"className":647,"code":649,"language":650},[648],"language-text","AI:   （完成初步假设分析）\n      \"已验证 3 条假设：\n       ✓ 数据库慢查询增加 - 与告警时间匹配\n       ✗ 网络时延 - 无异常\n       ✗ 资源不足 - CPU/内存充足\n\n       正在沿数据库方向继续深挖...\"\n\nUser: （看到假设列表，想起上周变更）\n      \"DBA 上周改过订单表索引\"\n\nAI:   （新增假设并验证）\n      \"新增假设：索引变更导致执行计划变化\n       → 检查 DDL 变更记录：发现上周四删除了订单表索引\n       → 对比执行计划：删除前走索引扫描，删除后变为全表扫描\n       → 时间关联：索引删除时间与慢查询开始时间一致\n       ✓ 假设成立，确认根因\"\n\nUser: （验证成功）\n      \"确认了！需要恢复索引。\"\n","text",[652,653,649],"code",{"__ignoreMap":654},"",[288,656,657],{"color":458,"icon":459},"AI 擅长处理海量数据和通用知识，人擅长提供业务上下文和历史经验。双向协作能让排障效率远超纯 AI 或纯人工。",[306,659],{},[320,661,663],{"id":662},"_4-退出策略","4. 退出策略",[309,665,666],{},"AI 并不总能直接找到根因，尤其在数据接入不完整时。但这并不意味着 AI 分析没有价值。",[335,668,669],{"id":669},"多组件问题的深度排查",[309,671,672],{},"复杂事故的根因可能横跨多个系统，或需要多层推理才能找到。假设驱动允许 AI 递归下钻，直到搜索空间被穷尽。",[309,674,675],{},[294,676,677],{},"案例：Pod 频繁重启（CrashLoopBackOff）",[645,679,682],{"className":680,"code":681,"language":650},[648],"告警：Kubernetes Pod 进入 CrashLoopBackOff 状态\n\n第一层分析：\n  → 假设：内存不足触发 OOM Kill\n  → 验证：检查 Pod 事件，确认 OOMKilled\n  → 结论：假设成立，但这只是表层原因\n\n第二层分析（递归深挖）：\n  → 假设：请求负载异常增大导致内存突增\n  → 验证：检查入流量，发现 Kafka 消息体异常变大\n  → 结论：假设成立，继续下钻\n\n第三层分析：\n  → 假设：上游系统发送了异常大消息\n  → 验证：追查消息来源，发现某批次数据包含损坏的大文件\n  → 结论：根因确认 - 上游数据异常导致消息尺寸溢出\n",[652,683,681],{"__ignoreMap":654},[309,685,686,687,690],{},"早期版本 AI 可能在第一层就停止，只给出“Pod OOM”结论。但这对工程师帮助有限，因为告警本身已经告诉了这件事。真正有价值的是找到",[294,688,689],{},"为什么会 OOM","。",[335,692,693],{"id":693},"排除干扰项的价值",[309,695,696],{},"即使 AI 缺少足够数据、无法直接锁定根因，它通常仍能：",[481,698,699,705],{},[418,700,701,704],{},[294,702,703],{},"指出大致排查方向","：比如“问题更可能在数据库层”或“与最近部署变更相关”",[418,706,707,710],{},[294,708,709],{},"排除无关干扰项","：比如确认网络连通正常、资源利用率充足、缓存命中率无异常",[309,712,713],{},"这种“排除法”本身就能节省大量时间。传统排障里，工程师往往要先逐项检查网络、资源、缓存等基础设施，再排除这些可能性。AI 能在几分钟内完成这一步，让用户直接聚焦最可能的问题方向。",[335,715,716],{"id":716},"上下文交接",[309,718,719],{},"当 AI 因数据不足无法继续深挖时，它可以向用户输出结构化上下文交接：",[645,721,724],{"className":722,"code":723,"language":650},[648],"📋 排查进度交接\n\n⏱️ 分析耗时：5 分钟 | 已扫描组件：12 个\n\n✅ 已排除：\n• 网络连通正常（Ping \u003C1ms，无丢包）\n• K8s 资源充足（CPU \u003C60%，Memory \u003C70%）\n• 缓存命中率正常（Redis 99.2%）\n\n🎯 重点方向：\n• 问题集中在 order-service → mysql-cluster 链路\n• 数据库性能相关问题概率较高\n\n⚠️ 需人工确认（缺失数据源）：\n• 数据库慢查询日志（未接入）\n• 最近 Schema 变更记录（未接入）\n",[652,725,723],{"__ignoreMap":654},[288,727,728],{"color":458,"icon":459},"前期扫描结果不会浪费。即使 AI 不能给出最终答案，用户也能从更小的排查范围起步，而不是从零开始。",[306,730],{},[320,732,734],{"id":733},"_5-知识沉淀","5. 知识沉淀",[309,736,737],{},[327,738],{"alt":739,"src":740},"Knowledge Accumulation and Experience Reuse","/images/blog/ai-troubleshooting/knowledge-accumulation.png",[309,742,743],{},"如果没有 SOP 或 Runbook，AI 首次遇到某类问题时可能需要大量探索。但这些探索结果不应被浪费。",[335,745,746],{"id":746},"因果验证的复杂性",[309,748,749,750,753],{},"假设驱动调查的核心是",[294,751,752],{},"验证因果关系","，即判断某个异常是否真的导致了当前告警。但因果验证远比看起来复杂：",[339,755,756,769],{},[342,757,758],{},[345,759,760,763,766],{},[348,761,762],{"align":350},"验证维度",[348,764,765],{"align":350},"说明",[348,767,768],{"align":350},"挑战",[359,770,771,784,797,810],{},[345,772,773,778,781],{},[364,774,775],{"align":350},[294,776,777],{},"时间相关性",[364,779,780],{"align":350},"异常出现时间是否与告警时间匹配",[364,782,783],{"align":350},"分布式系统中可能存在时间戳偏移",[345,785,786,791,794],{},[364,787,788],{"align":350},[294,789,790],{},"传播路径",[364,792,793],{"align":350},"异常是否位于告警对象的上下游链路",[364,795,796],{"align":350},"需要完整调用拓扑图",[345,798,799,804,807],{},[364,800,801],{"align":350},[294,802,803],{},"影响范围",[364,805,806],{"align":350},"异常影响的资源是否与告警相关",[364,808,809],{"align":350},"需要理解资源之间依赖关系",[345,811,812,817,820],{},[364,813,814],{"align":350},[294,815,816],{},"业务语义",[364,818,819],{"align":350},"该异常在业务层面是否合理",[364,821,822],{"align":350},"需要深度理解业务逻辑",[309,824,825,826,829],{},"其中最后一项“业务语义”尤其依赖",[294,827,828],{},"对客户业务的深度理解","。例如：",[415,831,832,835],{},[418,833,834],{},"订单服务延迟升高，AI 发现数据库有慢查询。但这个慢查询究竟是“每天零点跑、与线上业务无关的报表任务”，还是核心下单查询？只有懂业务的人才能判断。",[418,836,837],{},"某服务错误率上升，AI 发现最近有代码部署。但这个部署是“新功能灰度（预期会有少量错误）”，还是意外 bug？这需要结合发布计划判断。",[309,839,840],{},"这类业务知识无法直接从遥测数据里获得，必须通过知识沉淀逐步积累。",[335,842,843],{"id":843},"从排障过程沉淀知识",[309,845,846],{},"当一次事故调查结束后，AI 可将过程总结为知识条目：",[415,848,849,855,861],{},[418,850,851,854],{},[294,852,853],{},"问题特征","：本次调查由哪些告警/症状组合触发",[418,856,857,860],{},[294,858,859],{},"调查路径","：尝试了哪些方向，最终定位到什么根因",[418,862,863,866],{},[294,864,865],{},"解决方案","：如何修复、需要注意什么",[335,868,869],{"id":869},"绑定到特定告警与资源",[309,871,872],{},"这些知识可绑定到具体告警类型或资源。当下次遇到相似问题时：",[481,874,875,878,881],{},[418,876,877],{},"AI 自动召回相关知识",[418,879,880],{},"参考上次调查路径，快速确认是否同类问题",[418,882,883],{},"如果症状一致，直接给出修复建议；即便不一致，也能先排除该方向",[335,885,886],{"id":886},"示例场景",[645,888,891],{"className":889,"code":890,"language":650},[648],"第一次：\n• 告警：order-service P95 延迟上升\n• 调查过程：查网络 → 查资源 → 查数据库 → 发现索引问题\n• 沉淀知识：绑定到 order-service + 延迟类告警\n\n第二次：\n• 同类告警再次触发\n• AI 自动关联知识：\"上次相似问题由索引导致，是否优先检查数据库？\"\n• 用户确认后，直接跳过网络和资源排查，进入数据库检查\n• 调查耗时从 30 分钟降到 5 分钟\n",[652,892,890],{"__ignoreMap":654},[288,894,895],{"color":458,"icon":459},"因果验证的准确性取决于对业务的深度理解。通过知识沉淀，团队经验不再只存在于个人脑中，而是成为 AI 做因果判断的重要依据。",[306,897],{},[320,899,901],{"id":900},"_6-总结","6. 总结",[339,903,904,913],{},[342,905,906],{},[345,907,908,911],{},[348,909,910],{"align":350},"能力",[348,912,765],{"align":350},[359,914,915,925,935,944,954],{},[345,916,917,922],{},[364,918,919],{"align":350},[294,920,921],{},"可观测性上下文",[364,923,924],{"align":350},"集成 Metrics、Logs、Traces 与调用拓扑",[345,926,927,932],{},[364,928,929],{"align":350},[294,930,931],{},"假设驱动",[364,933,934],{"align":350},"提假设 → 验证 → 递归下钻，而非简单摘要",[345,936,937,941],{},[364,938,939],{"align":350},[294,940,300],{},[364,942,943],{"align":350},"AI 扫描数据，人类提供业务上下文和历史经验",[345,945,946,951],{},[364,947,948],{"align":350},[294,949,950],{},"退出策略",[364,952,953],{"align":350},"即便无法锁定根因，也能排除干扰项并输出关键结论",[345,955,956,961],{},[364,957,958],{"align":350},[294,959,960],{},"知识沉淀",[364,962,963],{"align":350},"沉淀业务知识，提升后续排障准确性与效率",[309,965,966],{},"Castrel 事故排障 Agent 的目标不是“AI 取代人”，而是让人机协作效率显著超过纯 AI 或纯人工。",{"title":654,"searchDepth":968,"depth":968,"links":969},2,[970,976,981,986,991,997],{"id":322,"depth":968,"text":323,"children":971},[972,974,975],{"id":337,"depth":973,"text":337},3,{"id":406,"depth":973,"text":406},{"id":435,"depth":973,"text":435},{"id":465,"depth":968,"text":466,"children":977},[978,979,980],{"id":469,"depth":973,"text":470},{"id":509,"depth":973,"text":509},{"id":515,"depth":973,"text":515},{"id":592,"depth":968,"text":593,"children":982},[983,984,985],{"id":602,"depth":973,"text":603},{"id":626,"depth":973,"text":626},{"id":643,"depth":973,"text":643},{"id":662,"depth":968,"text":663,"children":987},[988,989,990],{"id":669,"depth":973,"text":669},{"id":693,"depth":973,"text":693},{"id":716,"depth":973,"text":716},{"id":733,"depth":968,"text":734,"children":992},[993,994,995,996],{"id":746,"depth":973,"text":746},{"id":843,"depth":973,"text":843},{"id":869,"depth":973,"text":869},{"id":886,"depth":973,"text":886},{"id":900,"depth":968,"text":901},"本文介绍 Castrel 事故排障 Agent 的核心设计理念，包括假设驱动调查、人机协作和业务知识沉淀。","md",{"date":1001,"order":1002,"category":1003,"image":1004},"2026-01-19",1,"产品",{"src":1005},"/images/blog/ai-troubleshooting/incident-investigation-header.png","/zh/blogs/how-castrel-builds-an-incident-troubleshooting-agent",{"ogImage":1008,"title":283,"description":998},"/images/blog/ai-troubleshooting/og_image.png","zh/blogs/1.how-castrel-builds-an-incident-troubleshooting-agent","oA54gBLONysKNIo71wVUq01jbYBLqp4phDUrB4elDM0",1777027422944]