[{"data":1,"prerenderedAt":853},["ShallowReactive",2],{"navigation_docs_zh":3,"docs_zh_-zh-docs-features-incident-investigation":280,"docs_zh_-zh-docs-features-incident-investigation_surround":848},[4,18,51,254,267],{"title":5,"icon":6,"path":7,"stem":8,"children":9,"page":6},"入门",false,"/zh/docs/getting-started","zh/docs/1.getting-started",[10,14],{"title":11,"path":12,"stem":13},"简介","/zh/docs/getting-started/introduction","zh/docs/1.getting-started/1.introduction",{"title":15,"path":16,"stem":17},"快速开始","/zh/docs/getting-started/quick-start","zh/docs/1.getting-started/2.quick-start",{"title":19,"icon":6,"path":20,"stem":21,"children":22,"page":6},"功能","/zh/docs/features","zh/docs/2.features",[23,27,31,35,39,43,47],{"title":24,"path":25,"stem":26},"告警分诊","/zh/docs/features/alert-triage","zh/docs/2.features/2.alert-triage",{"title":28,"path":29,"stem":30},"事故调查","/zh/docs/features/incident-investigation","zh/docs/2.features/3.incident-investigation",{"title":32,"path":33,"stem":34},"部署验证","/zh/docs/features/deployment-verification","zh/docs/2.features/4.deployment-verification",{"title":36,"path":37,"stem":38},"数据探索","/zh/docs/features/data-exploration","zh/docs/2.features/5.data-exploration",{"title":40,"path":41,"stem":42},"知识库","/zh/docs/features/knowledges","zh/docs/2.features/6.knowledges",{"title":44,"path":45,"stem":46},"Castrel Proxy","/zh/docs/features/castrel-proxy","zh/docs/2.features/7.castrel-proxy",{"title":48,"path":49,"stem":50},"自动化","/zh/docs/features/automations","zh/docs/2.features/8.automations",{"title":52,"icon":6,"path":53,"stem":54,"children":55},"集成","/zh/docs/integrations","zh/docs/3.integrations/index",[56,57,62,67,72,77,81,85,89,94,99,104,109,113,117,122,127,131,136,141,146,151,156,160,165,170,174,178,183,188,193,198,203,208,212,216,220,224,229,234,239,244,249],{"title":52,"path":53,"stem":54},{"title":58,"path":59,"stem":60,"icon":61},"Prometheus","/zh/docs/integrations/prometheus","zh/docs/3.integrations/1.prometheus","i-simple-icons-prometheus",{"title":63,"path":64,"stem":65,"icon":66},"AWS","/zh/docs/integrations/aws","zh/docs/3.integrations/10.aws","i-simple-icons-amazonwebservices",{"title":68,"path":69,"stem":70,"icon":71},"阿里云（Aliyun）","/zh/docs/integrations/aliyun","zh/docs/3.integrations/12.aliyun","i-simple-icons-alibabacloud",{"title":73,"path":74,"stem":75,"icon":76},"腾讯云（Tencent Cloud）","/zh/docs/integrations/tencent-cloud","zh/docs/3.integrations/13.tencent-cloud","i-lucide-plug",{"title":78,"path":79,"stem":80,"icon":76},"华为云（Huawei Cloud）","/zh/docs/integrations/huaweicloud","zh/docs/3.integrations/14.huaweicloud",{"title":82,"path":83,"stem":84,"icon":76},"火山引擎（Volcengine）","/zh/docs/integrations/volcengine","zh/docs/3.integrations/15.volcengine",{"title":86,"path":87,"stem":88,"icon":76},"轻帆云（QingFanYun）","/zh/docs/integrations/qingfanyun","zh/docs/3.integrations/16.qingfanyun",{"title":90,"path":91,"stem":92,"icon":93},"Grafana","/zh/docs/integrations/grafana","zh/docs/3.integrations/17.grafana","i-simple-icons-grafana",{"title":95,"path":96,"stem":97,"icon":98},"VictoriaMetrics","/zh/docs/integrations/victoriametrics","zh/docs/3.integrations/18.victoriametrics","i-simple-icons-victoriametrics",{"title":100,"path":101,"stem":102,"icon":103},"New Relic","/zh/docs/integrations/new-relic","zh/docs/3.integrations/19.new-relic","i-simple-icons-newrelic",{"title":105,"path":106,"stem":107,"icon":108},"Elasticsearch","/zh/docs/integrations/elasticsearch","zh/docs/3.integrations/2.elasticsearch","i-simple-icons-elasticsearch",{"title":110,"path":111,"stem":112,"icon":76},"Zabbix","/zh/docs/integrations/zabbix","zh/docs/3.integrations/20.zabbix",{"title":114,"path":115,"stem":116,"icon":76},"监控宝（JianKongBao）","/zh/docs/integrations/jiankongbao","zh/docs/3.integrations/21.jiankongbao",{"title":118,"path":119,"stem":120,"icon":121},"PagerDuty","/zh/docs/integrations/pagerduty","zh/docs/3.integrations/22.pagerduty","i-simple-icons-pagerduty",{"title":123,"path":124,"stem":125,"icon":126},"Sentry","/zh/docs/integrations/sentry","zh/docs/3.integrations/23.sentry","i-simple-icons-sentry",{"title":128,"path":129,"stem":130,"icon":76},"Freshworks / Freshservice","/zh/docs/integrations/freshworks","zh/docs/3.integrations/24.freshworks",{"title":132,"path":133,"stem":134,"icon":135},"Linear","/zh/docs/integrations/linear","zh/docs/3.integrations/25.linear","i-simple-icons-linear",{"title":137,"path":138,"stem":139,"icon":140},"ClickHouse","/zh/docs/integrations/clickhouse","zh/docs/3.integrations/26.clickhouse","i-simple-icons-clickhouse",{"title":142,"path":143,"stem":144,"icon":145},"Kubernetes","/zh/docs/integrations/kubernetes","zh/docs/3.integrations/27.kubernetes","i-simple-icons-kubernetes",{"title":147,"path":148,"stem":149,"icon":150},"Terraform Cloud / HCP Terraform","/zh/docs/integrations/terraform","zh/docs/3.integrations/28.terraform","i-simple-icons-terraform",{"title":152,"path":153,"stem":154,"icon":155},"Jenkins","/zh/docs/integrations/jenkins","zh/docs/3.integrations/29.jenkins","i-simple-icons-jenkins",{"title":157,"path":158,"stem":159,"icon":93},"Grafana Loki","/zh/docs/integrations/grafana-loki","zh/docs/3.integrations/3.grafana-loki",{"title":161,"path":162,"stem":163,"icon":164},"Ansible / AWX","/zh/docs/integrations/ansible","zh/docs/3.integrations/30.ansible","i-simple-icons-ansible",{"title":166,"path":167,"stem":168,"icon":169},"GitLab","/zh/docs/integrations/gitlab","zh/docs/3.integrations/31.gitlab","i-simple-icons-gitlab",{"title":171,"path":172,"stem":173,"icon":76},"钉钉（DingTalk）","/zh/docs/integrations/dingtalk","zh/docs/3.integrations/32.dingtalk",{"title":175,"path":176,"stem":177,"icon":76},"飞书（Feishu / Lark）","/zh/docs/integrations/feishu","zh/docs/3.integrations/33.feishu",{"title":179,"path":180,"stem":181,"icon":182},"Telegram","/zh/docs/integrations/telegram","zh/docs/3.integrations/34.telegram","i-simple-icons-telegram",{"title":184,"path":185,"stem":186,"icon":187},"Email","/zh/docs/integrations/email","zh/docs/3.integrations/35.email","i-simple-icons-gmail",{"title":189,"path":190,"stem":191,"icon":192},"微信企业机器人（Weixin Clawbot）","/zh/docs/integrations/weixin-clawbot","zh/docs/3.integrations/36.weixin-clawbot","i-simple-icons-wechat",{"title":194,"path":195,"stem":196,"icon":197},"Notion","/zh/docs/integrations/notion","zh/docs/3.integrations/37.notion","i-simple-icons-notion",{"title":199,"path":200,"stem":201,"icon":202},"Confluence","/zh/docs/integrations/confluence","zh/docs/3.integrations/38.confluence","i-simple-icons-confluence",{"title":204,"path":205,"stem":206,"icon":207},"Google Docs","/zh/docs/integrations/google-docs","zh/docs/3.integrations/39.google-docs","i-simple-icons-googledocs",{"title":209,"path":210,"stem":211,"icon":93},"Grafana Tempo","/zh/docs/integrations/grafana-tempo","zh/docs/3.integrations/4.grafana-tempo",{"title":213,"path":214,"stem":215,"icon":76},"钉钉文档（DingTalk Docs）","/zh/docs/integrations/dingtalk-docs","zh/docs/3.integrations/40.dingtalk-docs",{"title":217,"path":218,"stem":219,"icon":76},"LDAP","/zh/docs/integrations/ldap","zh/docs/3.integrations/41.ldap",{"title":221,"path":222,"stem":223,"icon":76},"Dify","/zh/docs/integrations/dify","zh/docs/3.integrations/42.dify",{"title":225,"path":226,"stem":227,"icon":228},"自定义 MCP（Custom MCP）","/zh/docs/integrations/custom-mcp","zh/docs/3.integrations/43.custom-mcp","i-simple-icons-anthropic",{"title":230,"path":231,"stem":232,"icon":233},"GitHub","/zh/docs/integrations/github","zh/docs/3.integrations/5.github","i-simple-icons-github",{"title":235,"path":236,"stem":237,"icon":238},"Slack","/zh/docs/integrations/slack","zh/docs/3.integrations/6.slack","i-simple-icons-slack",{"title":240,"path":241,"stem":242,"icon":243},"Vercel","/zh/docs/integrations/vercel","zh/docs/3.integrations/7.vercel","i-simple-icons-vercel",{"title":245,"path":246,"stem":247,"icon":248},"Graylog","/zh/docs/integrations/graylog","zh/docs/3.integrations/8.graylog","i-simple-icons-graylog",{"title":250,"path":251,"stem":252,"icon":253},"Datadog","/zh/docs/integrations/datadog","zh/docs/3.integrations/9.datadog","i-simple-icons-datadog",{"title":255,"path":256,"stem":257,"children":258,"page":6},"更多","/zh/docs/more","zh/docs/4.more",[259,263],{"title":260,"path":261,"stem":262},"路线图","/zh/docs/more/roadmap","zh/docs/4.more/1.roadmap",{"title":264,"path":265,"stem":266},"支持","/zh/docs/more/support","zh/docs/4.more/2.support",{"title":268,"path":269,"stem":270,"children":271,"page":6},"安全","/zh/docs/security","zh/docs/5.security",[272,276],{"title":273,"path":274,"stem":275},"隐私政策","/zh/docs/security/privacy-policy","zh/docs/5.security/1.privacy-policy",{"title":277,"path":278,"stem":279},"服务条款","/zh/docs/security/terms-of-service","zh/docs/5.security/2.terms-of-service",{"id":281,"title":28,"body":282,"description":841,"extension":842,"links":843,"meta":844,"navigation":845,"path":29,"seo":846,"stem":30,"__hash__":847},"docs_zh/zh/docs/2.features/3.incident-investigation.md",{"type":283,"value":284,"toc":832},"minimark",[285,289,294,301,317,320,326,329,369,374,377,398,403,406,468,471,478,482,485,553,556,564,568,571,623,626,631,634,645,651,654,658,661,666,669,683,688,691,705,710,713,724,729,732,746,749,806,809,814,818,822,828],[286,287,288],"p",{},"事故调查是 Castrel 的核心能力之一，它把 AI 分析与人的专业经验结合起来，帮助 SRE 团队快速识别根因。无论你是在手机上响应告警，还是在桌面前做深入排查，Castrel 都能为不同场景提供合适的工具。",[290,291,293],"h2",{"id":292},"什么是事故调查","什么是事故调查？",[286,295,296,297,300],{},"事故调查是一套由 AI 驱动的根因分析系统，帮助你定位线上问题的来源。当某条告警在 ",[298,299,24],"a",{"href":25}," 中被确认为事故后，Castrel 会自动扫描你的基础设施，例如 K8s 事件、Pod 日志、数据库指标等，以识别潜在根因，并可视化故障在系统中的传播路径。",[286,302,303,304,308,309,312,313,316],{},"与传统监控工具只能告诉你 ",[305,306,307],"em",{},"出了什么问题"," 不同，事故调查会进一步告诉你 ",[305,310,311],{},"为什么会发生","、",[305,314,315],{},"该从哪里查起","，让你能更快地处理事故。",[290,318,319],{"id":319},"如何使用事故调查",[286,321,322],{},[323,324,325],"strong",{},"1. 发起调查",[286,327,328],{},"你可以通过两种方式启动事故调查：",[330,331,332,343],"ul",{},[333,334,335,338,339,342],"li",{},[323,336,337],{},"从告警分诊进入","：当某条告警被分类为 ",[323,340,341],{},"事故"," 后，点击 “Start Investigation” 开始根因分析",[333,344,345,348,349],{},[323,346,347],{},"手动触发","：在 Castrel 界面中点击 “Start Investigation”，并配置：\n",[330,350,351,357,363],{},[333,352,353,356],{},[323,354,355],{},"时间范围","：当前（最近 1 小时）、最近告警时间，或自定义时间段",[333,358,359,362],{},[323,360,361],{},"Application","：选择受影响的应用或服务",[333,364,365,368],{},[323,366,367],{},"补充上下文","（可选）：粘贴告警内容、指定资源，或描述你观察到的症状",[286,370,371],{},[323,372,373],{},"2. 查看分析报告",[286,375,376],{},"启动调查后，Castrel 会做一次全面扫描，并生成包含以下内容的分析报告：",[378,379,380,386,392],"ol",{},[333,381,382,385],{},[323,383,384],{},"假设列表","：AI 生成的潜在根因，每个假设都附带支撑证据",[333,387,388,391],{},[323,389,390],{},"传播拓扑","：可视化展示故障如何在服务之间传播",[333,393,394,397],{},[323,395,396],{},"证据摘要","：包括日志、指标、代码变更、事件等关键数据点",[286,399,400],{},[323,401,402],{},"3. 选择下一步",[286,404,405],{},"根据分析结果，你可以走三条路径：",[407,408,409,425],"table",{},[410,411,412],"thead",{},[413,414,415,419,422],"tr",{},[416,417,418],"th",{},"场景",[416,420,421],{},"动作",[416,423,424],{},"说明",[426,427,428,442,455],"tbody",{},[413,429,430,436,439],{},[431,432,433],"td",{},[323,434,435],{},"根因很明确",[431,437,438],{},"Confirm & Close",[431,440,441],{},"AI 已找到证据充分的明确根因。你只需复核并确认，即可结束调查",[413,443,444,449,452],{},[431,445,446],{},[323,447,448],{},"仍需人工深挖",[431,450,451],{},"Get Report",[431,453,454],{},"下载一份上下文摘要，其中包含已排除的可能性和仍待确认的方向，便于继续手动排查",[413,456,457,462,465],{},[431,458,459],{},[323,460,461],{},"方向对了但还不够",[431,463,464],{},"Provide Guidance",[431,466,467],{},"利用你的领域知识，引导 AI 向某个方向继续深入",[290,469,470],{"id":470},"人机协同调查",[286,472,473,474,477],{},"事故调查不是让你被动接收 AI 结果，而是为 ",[323,475,476],{},"双向协作"," 设计的。你可以主动利用自己的领域知识，引导调查过程。",[286,479,480],{},[323,481,390],{},[286,483,484],{},"传播拓扑会把故障在系统中的传播路径分为四层：",[407,486,487,499],{},[410,488,489],{},[413,490,491,494,497],{},[416,492,493],{},"层级",[416,495,496],{},"图标",[416,498,424],{},[426,500,501,514,527,540],{},[413,502,503,508,511],{},[431,504,505],{},[323,506,507],{},"根因",[431,509,510],{},"🔴",[431,512,513],{},"故障的源头",[413,515,516,521,524],{},[431,517,518],{},[323,519,520],{},"关键传播",[431,522,523],{},"🟠",[431,525,526],{},"故障传播过程中的关键节点",[413,528,529,534,537],{},[431,530,531],{},[323,532,533],{},"直接影响",[431,535,536],{},"🟡",[431,538,539],{},"被故障直接影响的服务",[413,541,542,547,550],{},[431,543,544],{},[323,545,546],{},"间接影响",[431,548,549],{},"⚪",[431,551,552],{},"经由多跳传播后受到影响的边缘服务",[286,554,555],{},"你可以在拓扑中做这些操作：",[330,557,558,561],{},[333,559,560],{},"将某个节点标记为疑似根因，以便做更聚焦的分析",[333,562,563],{},"查看传播路径，理解爆炸半径",[286,565,566],{},[323,567,384],{},[286,569,570],{},"假设列表展示 AI 生成的潜在根因，你可以对它们进行管理：",[407,572,573,581],{},[410,574,575],{},[413,576,577,579],{},[416,578,421],{},[416,580,424],{},[426,582,583,593,603,613],{},[413,584,585,590],{},[431,586,587],{},[323,588,589],{},"Add Hypothesis",[431,591,592],{},"基于领域知识补充你的假设（例如“DBA 上周调整了索引”）",[413,594,595,600],{},[431,596,597],{},[323,598,599],{},"Verify Hypothesis",[431,601,602],{},"让 AI 为某个假设收集更多证据",[413,604,605,610],{},[431,606,607],{},[323,608,609],{},"Confirm Hypothesis",[431,611,612],{},"将该假设标记为已确认根因",[413,614,615,620],{},[431,616,617],{},[323,618,619],{},"Reject Hypothesis",[431,621,622],{},"将该假设从候选范围中排除",[286,624,625],{},"每个假设都会附带支撑证据，包括日志、指标、代码 diff 或事件，你可以逐条查看。",[286,627,628],{},[323,629,630],{},"通过聊天补充指导",[286,632,633],{},"你也可以直接用自然语言引导调查：",[635,636,642],"pre",{"className":637,"code":639,"language":640,"meta":641},[638],"language-text","检查一下 order-service 最近的部署，尤其是事务逻辑相关的改动\n","text","",[643,644,639],"code",{"__ignoreMap":641},[635,646,649],{"className":647,"code":648,"language":640,"meta":641},[638],"重点看一下 3:15 左右的数据库锁问题\n",[643,650,648],{"__ignoreMap":641},[286,652,653],{},"Castrel 会结合你的指导，把它的全局数据扫描能力聚焦到你指定的方向上。",[290,655,657],{"id":656},"castrel-如何调查事故","Castrel 如何调查事故",[286,659,660],{},"Castrel 会按照一套系统化流程执行根因分析：",[286,662,663],{},[323,664,665],{},"1. 数据采集",[286,667,668],{},"Castrel 会在指定时间范围内从已连接的数据源收集信息：",[330,670,671,674,677,680],{},[333,672,673],{},"Kubernetes 事件和 Pod 日志",[333,675,676],{},"应用指标和链路",[333,678,679],{},"数据库性能数据",[333,681,682],{},"部署与配置变更历史",[286,684,685],{},[323,686,687],{},"2. 假设生成",[286,689,690],{},"基于收集到的数据，Castrel 会通过以下方式生成假设：",[330,692,693,696,699,702],{},[333,694,695],{},"识别指标异常（延迟激增、错误率上升等）",[333,697,698],{},"将部署、配置更新等变更与事故发生时间做关联",[333,700,701],{},"分析错误日志和堆栈信息",[333,703,704],{},"识别资源饱和模式",[286,706,707],{},[323,708,709],{},"3. 传播分析",[286,711,712],{},"Castrel 会建立传播模型：",[330,714,715,718,721],{},[333,716,717],{},"追踪服务依赖",[333,719,720],{},"识别故障起点",[333,722,723],{},"映射故障如何沿着架构传播",[286,725,726],{},[323,727,728],{},"4. 证据汇总",[286,730,731],{},"对于每个假设，Castrel 会汇总支撑证据：",[330,733,734,737,740,743],{},[333,735,736],{},"带时间戳的相关日志",[333,738,739],{},"展示异常的指标图表",[333,741,742],{},"最近变更中的代码 diff",[333,744,745],{},"与相似历史事故的关联",[290,747,748],{"id":748},"提升效果的小建议",[407,750,751,760],{},[410,752,753],{},[413,754,755,758],{},[416,756,757],{},"建议",[416,759,424],{},[426,761,762,772,786,796],{},[413,763,764,769],{},[431,765,766],{},[323,767,768],{},"接入全部数据源",[431,770,771],{},"指标、日志、链路和变更管理接得越完整，根因识别越准确",[413,773,774,779],{},[431,775,776],{},[323,777,778],{},"使用知识库",[431,780,781,782,785],{},"在 ",[298,783,784],{"href":41},"知识"," 中记录预期行为和 runbook，帮助 Castrel 更好理解你的系统",[413,787,788,793],{},[431,789,790],{},[323,791,792],{},"提供业务上下文",[431,794,795],{},"AI 擅长扫数据，你擅长业务背景，二者结合效果最好",[413,797,798,803],{},[431,799,800],{},[323,801,802],{},"查看全部证据",[431,804,805],{},"在确认某个假设之前，先把支撑证据看完整",[290,807,808],{"id":808},"常见问题",[810,811,813],"collapsible",{"name":812},"一次调查通常要多久？","初始分析通常会在 2 到 5 分钟内完成，具体取决于数据源范围和问题复杂度。分析过程中你可以边看边继续交互，不必等全部结束。",[810,815,817],{"name":816},"可以调查历史事故吗？","可以。使用自定义时间范围即可分析历史事故，适合做复盘和模式分析。",[810,819,821],{"name":820},"如果 AI 找不到根因怎么办？","这在复杂场景里很正常。你可以使用 “Get Report” 获取一份结构化摘要，了解哪些方向已经被排除、哪些方向还没有覆盖。很多时候，人的领域知识和 AI 的系统化分析结合起来，效果会比单独依赖其中一方更好。",[810,823,825,827],{"name":824},"它和告警分诊是什么关系？",[298,826,24],{"href":25}," 负责对进入系统的告警做分类并识别事故。一旦告警被确认为事故，事故调查就会接手，继续做深入根因分析。两者一起构成端到端的事故响应链路。",[810,829,831],{"name":830},"多位团队成员可以一起协作调查吗？","可以。调查状态会在 Web 控制台和 Slack 之间同步，所以多个团队成员可以同时查看结果、添加假设并提供指导。",{"title":641,"searchDepth":833,"depth":833,"links":834},2,[835,836,837,838,839,840],{"id":292,"depth":833,"text":293},{"id":319,"depth":833,"text":319},{"id":470,"depth":833,"text":470},{"id":656,"depth":833,"text":657},{"id":748,"depth":833,"text":748},{"id":808,"depth":833,"text":808},"通过自动化根因分析与人机协作，更快完成事故调查。","md",null,{},true,{"title":28,"description":841},"k3JWcQueScCdWvcBJPLuouKFcXa1ERRbCKgGZIZuz7U",[849,851],{"title":24,"path":25,"stem":26,"description":850,"children":-1},"使用 AI 洞察高效完成告警分诊与管理。",{"title":32,"path":33,"stem":34,"description":852,"children":-1},"通过分析日志、代码变更并生成可执行修复建议，自动诊断部署失败。",1777027428584]