| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- import { logForDebugging } from '../utils/debug.js'
- import { BridgeFatalError } from './bridgeApi.js'
- import type { BridgeApiClient } from './types.js'
- /**
- * Ant-only fault injection for manually testing bridge recovery paths.
- *
- * Real failure modes this targets (BQ 2026-03-12, 7-day window):
- * poll 404 not_found_error — 147K sessions/week, dead onEnvironmentLost gate
- * ws_closed 1002/1006 — 22K sessions/week, zombie poll after close
- * register transient failure — residual: network blips during doReconnect
- *
- * Usage: /bridge-kick <subcommand> from the REPL while Remote Control is
- * connected, then tail debug.log to watch the recovery machinery react.
- *
- * Module-level state is intentional here: one bridge per REPL process, the
- * /bridge-kick slash command has no other way to reach into initBridgeCore's
- * closures, and teardown clears the slot.
- */
/**
 * A queued fault: the next matching api call(s) fail with the described
 * error instead of reaching the real client.
 */
type BridgeFault = {
  /** Api method whose upcoming call(s) should fail. */
  method:
    | 'pollForWork'
    | 'registerBridgeEnvironment'
    | 'reconnectSession'
    | 'heartbeatWork'
  /**
   * Flavour of failure to raise. Fatal errors take the
   * handleErrorStatus → BridgeFatalError route; transient errors look like
   * plain axios rejections (5xx / network). Recovery code tells them apart:
   * fatal → teardown, transient → retry/backoff.
   */
  kind: 'fatal' | 'transient'
  /** HTTP status code carried by the injected error. */
  status: number
  /** Optional error type string reported alongside the status. */
  errorType?: string
  /** Injections still owed. Drops by one per consume; entry removed at 0. */
  count: number
}
/**
 * Hooks the running bridge exposes so debug tooling can poke at its
 * recovery machinery from outside its closures.
 */
export type BridgeDebugHandle = {
  /**
   * Call the transport's permanent-close handler directly, exercising the
   * ws_closed → reconnectEnvironmentWithSession escalation (#22148).
   */
  fireClose: (code: number) => void
  /**
   * Run reconnectEnvironmentWithSession() — equivalent to SIGUSR2, but
   * reachable from the slash command.
   */
  forceReconnect: () => void
  /** Queue a fault for the next N calls to the named api method. */
  injectFault: (fault: BridgeFault) => void
  /**
   * Abort the at-capacity sleep so an injected poll fault takes effect
   * immediately rather than up to 10min later.
   */
  wakePollLoop: () => void
  /** Return the env/session IDs to grep for in debug.log. */
  describe: () => string
}
/** Handle of the currently registered bridge, or null when none is registered. */
let debugHandle: null | BridgeDebugHandle = null
/** Faults waiting to be injected, in the order they were queued. */
const faultQueue: Array<BridgeFault> = []
/**
 * Install `h` as the module-wide debug handle, replacing any handle that was
 * registered before.
 *
 * @param h Debug hooks of the bridge that is now active.
 */
export function registerBridgeDebugHandle(h: BridgeDebugHandle): void {
  debugHandle = h
}
/**
 * Reset the module state: discard every fault still queued and forget the
 * registered debug handle.
 */
export function clearBridgeDebugHandle(): void {
  // Empty the array in place — the queue binding is const and shared.
  faultQueue.splice(0, faultQueue.length)
  debugHandle = null
}
/**
 * Look up the debug handle of the active bridge.
 *
 * @returns The registered handle, or null when none is registered.
 */
export function getBridgeDebugHandle(): BridgeDebugHandle | null {
  return debugHandle
}
/**
 * Queue a fault to be injected into upcoming bridge api calls.
 *
 * The queue keeps its own shallow copy of `fault`. Consuming a fault
 * decrements its `count` in place, and that bookkeeping must not leak into
 * the caller's object — a reused fault object would otherwise be re-queued
 * with a stale (already decremented) count.
 *
 * A NaN `count` is queued as 1: NaN never compares `<= 0`, so such an entry
 * would never be removed and would fail every matching call until teardown.
 *
 * @param fault Which api method to fail, how, and for how many calls.
 */
export function injectBridgeFault(fault: BridgeFault): void {
  const queued: BridgeFault = {
    ...fault,
    count: Number.isNaN(fault.count) ? 1 : fault.count,
  }
  faultQueue.push(queued)
  // Log the queued copy so the reported count is the one that will be used.
  logForDebugging(
    `[bridge:debug] Queued fault: ${queued.method} ${queued.kind}/${queued.status}${queued.errorType ? `/${queued.errorType}` : ''} ×${queued.count}`,
  )
}
/**
 * Decorate a BridgeApiClient with fault injection.
 *
 * The four overridden methods (pollForWork, registerBridgeEnvironment,
 * reconnectSession, heartbeatWork) look for a queued fault naming them
 * before doing anything else. When one is found, the call rejects with the
 * injected error and the real client is never invoked; otherwise the call is
 * forwarded unchanged. Every other member is copied from `api` as-is.
 *
 * Intended for ant (USER_TYPE === 'ant') builds only. This function performs
 * no such check itself, so the gating is the caller's responsibility.
 *
 * @param api The real client to delegate to.
 * @returns A client with the same surface that honours the fault queue.
 */
export function wrapApiForFaultInjection(
  api: BridgeApiClient,
): BridgeApiClient {
  /**
   * Take one injection from the earliest queued fault for `method`, dropping
   * the entry once its count is used up. Returns null when nothing matches.
   */
  function takeFault(method: BridgeFault['method']): BridgeFault | null {
    for (let i = 0; i < faultQueue.length; i += 1) {
      const candidate = faultQueue[i]!
      if (candidate.method !== method) continue
      candidate.count -= 1
      if (candidate.count <= 0) {
        faultQueue.splice(i, 1)
      }
      return candidate
    }
    return null
  }

  /** Log the injection, then throw the error shape matching the fault kind. */
  function raiseFault(fault: BridgeFault, context: string): never {
    const { kind, status, errorType } = fault
    logForDebugging(
      `[bridge:debug] Injecting ${kind} fault into ${context}: status=${status} errorType=${errorType ?? 'none'}`,
    )
    if (kind !== 'fatal') {
      // Transient: a plain Error with no status field of its own, standing
      // in for an axios rejection (5xx / network).
      throw new Error(`[injected transient] ${context} ${status}`)
    }
    throw new BridgeFatalError(
      `[injected] ${context} ${status}`,
      status,
      errorType,
    )
  }

  /** Throw the queued fault for `method`, if any; otherwise do nothing. */
  function failIfQueued(method: BridgeFault['method'], context: string): void {
    const fault = takeFault(method)
    if (fault !== null) raiseFault(fault, context)
  }

  // The overrides stay `async` so an injected throw surfaces as a rejected
  // promise, the same way a failure of the real call would.
  return {
    ...api,
    async pollForWork(envId, secret, signal, reclaimMs) {
      failIfQueued('pollForWork', 'Poll')
      return api.pollForWork(envId, secret, signal, reclaimMs)
    },
    async registerBridgeEnvironment(config) {
      failIfQueued('registerBridgeEnvironment', 'Registration')
      return api.registerBridgeEnvironment(config)
    },
    async reconnectSession(envId, sessionId) {
      failIfQueued('reconnectSession', 'ReconnectSession')
      return api.reconnectSession(envId, sessionId)
    },
    async heartbeatWork(envId, workId, token) {
      failIfQueued('heartbeatWork', 'Heartbeat')
      return api.heartbeatWork(envId, workId, token)
    },
  }
}
|