bridgeDebug.ts 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. import { logForDebugging } from '../utils/debug.js'
  2. import { BridgeFatalError } from './bridgeApi.js'
  3. import type { BridgeApiClient } from './types.js'
  4. /**
  5. * Ant-only fault injection for manually testing bridge recovery paths.
  6. *
  7. * Real failure modes this targets (BQ 2026-03-12, 7-day window):
  8. * poll 404 not_found_error — 147K sessions/week, dead onEnvironmentLost gate
  9. * ws_closed 1002/1006 — 22K sessions/week, zombie poll after close
  10. * register transient failure — residual: network blips during doReconnect
  11. *
  12. * Usage: /bridge-kick <subcommand> from the REPL while Remote Control is
  13. * connected, then tail debug.log to watch the recovery machinery react.
  14. *
  15. * Module-level state is intentional here: one bridge per REPL process, the
  16. * /bridge-kick slash command has no other way to reach into initBridgeCore's
  17. * closures, and teardown clears the slot.
  18. */
  /** Names of the api client calls that can have a fault injected. */
  type BridgeFaultMethod =
    | 'pollForWork'
    | 'registerBridgeEnvironment'
    | 'reconnectSession'
    | 'heartbeatWork'

  /**
   * A fault waiting in the queue: the next `count` calls to `method` fail
   * with the described error instead of reaching the real api client.
   */
  type BridgeFault = {
    /** The api call this fault intercepts. */
    method: BridgeFaultMethod
    /**
     * How the failure surfaces. Fatal errors go through handleErrorStatus →
     * BridgeFatalError; transient errors surface as plain axios rejections
     * (5xx / network). Recovery code distinguishes the two:
     * fatal → teardown, transient → retry/backoff.
     */
    kind: 'fatal' | 'transient'
    /** Status code reported in the injected error and in the debug log. */
    status: number
    /** Optional error type, forwarded to BridgeFatalError for fatal faults. */
    errorType?: string
    /** Injections still to perform. Decremented on consume; removed at 0. */
    count: number
  }
  /**
   * Hooks a running bridge exposes so the /bridge-kick slash command can
   * poke at its recovery machinery.
   */
  export type BridgeDebugHandle = {
    /**
     * Calls the transport's permanent-close handler directly with the given
     * close code. Exercises the ws_closed → reconnectEnvironmentWithSession
     * escalation (#22148).
     */
    fireClose: (code: number) => void
    /**
     * Runs reconnectEnvironmentWithSession() — the same thing SIGUSR2 does,
     * but reachable from the slash command.
     */
    forceReconnect: () => void
    /** Queues a fault for the next N calls to the named api method. */
    injectFault: (fault: BridgeFault) => void
    /**
     * Aborts the at-capacity sleep so that an injected poll fault lands
     * right away rather than up to 10min later.
     */
    wakePollLoop: () => void
    /** Returns the env/session IDs to grep for in debug.log. */
    describe: () => string
  }
  /**
   * The currently registered debug handle. Null until
   * registerBridgeDebugHandle() is called, and null again after
   * clearBridgeDebugHandle().
   */
  let debugHandle: BridgeDebugHandle | null = null

  /** Faults that have been queued but not yet fully consumed. */
  const faultQueue: Array<BridgeFault> = []

  /** Stores `h` as the module's debug handle, replacing any previous one. */
  export function registerBridgeDebugHandle(h: BridgeDebugHandle): void {
    debugHandle = h
  }

  /** Drops the registered handle and discards every fault still queued. */
  export function clearBridgeDebugHandle(): void {
    // Empty the array in place: other code holds a reference to this same
    // array, so it must not be replaced with a new one.
    faultQueue.splice(0, faultQueue.length)
    debugHandle = null
  }

  /** Returns the registered debug handle, or null if there is none. */
  export function getBridgeDebugHandle(): BridgeDebugHandle | null {
    return debugHandle
  }
  /**
   * Queue a fault to be injected into upcoming calls of `fault.method`.
   *
   * The queue stores its own shallow copy of `fault`: consuming a fault
   * decrements `count` on the queued entry, and that bookkeeping must not
   * leak back into the object the caller passed in.
   *
   * A fault whose `count` is not at least 1 (0, negative, or NaN) has no
   * injections left to perform, so it is logged and skipped rather than
   * queued. In particular a NaN count would never reach the removal
   * threshold and would keep injecting until the queue is cleared.
   *
   * @param fault Description of the failure to inject and how many times.
   */
  export function injectBridgeFault(fault: BridgeFault): void {
    const description = `${fault.method} ${fault.kind}/${fault.status}${fault.errorType ? `/${fault.errorType}` : ''} ×${fault.count}`
    // Written as a negated `>=` so that NaN (which fails every comparison)
    // is rejected together with zero and negative counts.
    if (!(fault.count >= 1)) {
      logForDebugging(
        `[bridge:debug] Ignoring fault with no injections to perform: ${description}`,
      )
      return
    }
    faultQueue.push({ ...fault })
    logForDebugging(`[bridge:debug] Queued fault: ${description}`)
  }
  /**
   * Wrap a BridgeApiClient so that pollForWork, registerBridgeEnvironment,
   * reconnectSession and heartbeatWork each check the fault queue before
   * doing anything else. If a fault is queued for the method being called,
   * the call fails with the described error and the real client is not
   * invoked; otherwise the call is forwarded unchanged. Every other member
   * of `api` is copied onto the wrapper as-is.
   *
   * Only called when USER_TYPE === 'ant' — zero overhead in external builds.
   *
   * @param api The real client to delegate to.
   * @returns A client with the same members as `api`, with the four methods
   *   above replaced by fault-checking versions.
   */
  export function wrapApiForFaultInjection(
    api: BridgeApiClient,
  ): BridgeApiClient {
    /**
     * Take one injection from the first queued fault for `method`, or
     * return null when none is queued. The entry leaves the queue once it
     * has no injections remaining.
     */
    function consume(method: BridgeFault['method']): BridgeFault | null {
      const fault = faultQueue.find(queued => queued.method === method)
      if (fault === undefined) return null
      fault.count--
      // Negated `>` instead of `<= 0`: a NaN count fails every comparison,
      // so `<= 0` would never remove it and the fault would fire on every
      // call until the queue is cleared.
      if (!(fault.count > 0)) faultQueue.splice(faultQueue.indexOf(fault), 1)
      return fault
    }

    /** Log the injection, then throw the error that `fault` describes. */
    function throwFault(fault: BridgeFault, context: string): never {
      logForDebugging(
        `[bridge:debug] Injecting ${fault.kind} fault into ${context}: status=${fault.status} errorType=${fault.errorType ?? 'none'}`,
      )
      if (fault.kind === 'fatal') {
        throw new BridgeFatalError(
          `[injected] ${context} ${fault.status}`,
          fault.status,
          fault.errorType,
        )
      }
      // Transient: mimic an axios rejection (5xx / network). No .status on
      // the error itself — that's how the catch blocks distinguish.
      throw new Error(`[injected transient] ${context} ${fault.status}`)
    }

    /** Throw the queued fault for `method`, if there is one. */
    function failIfFaultQueued(
      method: BridgeFault['method'],
      context: string,
    ): void {
      const fault = consume(method)
      if (fault !== null) throwFault(fault, context)
    }

    // The wrappers are async, so an injected throw reaches the caller as a
    // rejected promise.
    return {
      ...api,
      async pollForWork(envId, secret, signal, reclaimMs) {
        failIfFaultQueued('pollForWork', 'Poll')
        return api.pollForWork(envId, secret, signal, reclaimMs)
      },
      async registerBridgeEnvironment(config) {
        failIfFaultQueued('registerBridgeEnvironment', 'Registration')
        return api.registerBridgeEnvironment(config)
      },
      async reconnectSession(envId, sessionId) {
        failIfFaultQueued('reconnectSession', 'ReconnectSession')
        return api.reconnectSession(envId, sessionId)
      },
      async heartbeatWork(envId, workId, token) {
        failIfFaultQueued('heartbeatWork', 'Heartbeat')
        return api.heartbeatWork(envId, workId, token)
      },
    }
  }