// tokenEstimation.ts
  1. import type { Anthropic } from '@anthropic-ai/sdk'
  2. import type { BetaMessageParam as MessageParam } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
  3. // @aws-sdk/client-bedrock-runtime is imported dynamically in countTokensWithBedrock()
  4. // to defer ~279KB of AWS SDK code until a Bedrock call is actually made
  5. import type { CountTokensCommandInput } from '@aws-sdk/client-bedrock-runtime'
  6. import { getAPIProvider } from 'src/utils/model/providers.js'
  7. import { VERTEX_COUNT_TOKENS_ALLOWED_BETAS } from '../constants/betas.js'
  8. import type { Attachment } from '../utils/attachments.js'
  9. import { getModelBetas } from '../utils/betas.js'
  10. import { getVertexRegionForModel, isEnvTruthy } from '../utils/envUtils.js'
  11. import { logError } from '../utils/log.js'
  12. import { normalizeAttachmentForAPI } from '../utils/messages.js'
  13. import {
  14. createBedrockRuntimeClient,
  15. getInferenceProfileBackingModel,
  16. isFoundationModel,
  17. } from '../utils/model/bedrock.js'
  18. import {
  19. getDefaultSonnetModel,
  20. getMainLoopModel,
  21. getSmallFastModel,
  22. normalizeModelStringForAPI,
  23. } from '../utils/model/model.js'
  24. import { jsonStringify } from '../utils/slowOperations.js'
  25. import { isToolReferenceBlock } from '../utils/toolSearch.js'
  26. import { getAPIMetadata, getExtraBodyParams } from './api/claude.js'
  27. import { getAnthropicClient } from './api/client.js'
  28. import { withTokenCountVCR } from './vcr.js'
// Minimal values for token counting with thinking enabled.
// API constraint: max_tokens must be strictly greater than
// thinking.budget_tokens, so these two constants must change together.
const TOKEN_COUNT_THINKING_BUDGET = 1024
const TOKEN_COUNT_MAX_TOKENS = 2048
  33. /**
  34. * Check if messages contain thinking blocks
  35. */
  36. function hasThinkingBlocks(
  37. messages: Anthropic.Beta.Messages.BetaMessageParam[],
  38. ): boolean {
  39. for (const message of messages) {
  40. if (message.role === 'assistant' && Array.isArray(message.content)) {
  41. for (const block of message.content) {
  42. if (
  43. typeof block === 'object' &&
  44. block !== null &&
  45. 'type' in block &&
  46. (block.type === 'thinking' || block.type === 'redacted_thinking')
  47. ) {
  48. return true
  49. }
  50. }
  51. }
  52. }
  53. return false
  54. }
  55. /**
  56. * Strip tool search-specific fields from messages before sending for token counting.
  57. * This removes 'caller' from tool_use blocks and 'tool_reference' from tool_result content.
  58. * These fields are only valid with the tool search beta and will cause errors otherwise.
  59. *
  60. * Note: We use 'as unknown as' casts because the SDK types don't include tool search beta fields,
  61. * but at runtime these fields may exist from API responses when tool search was enabled.
  62. */
  63. function stripToolSearchFieldsFromMessages(
  64. messages: Anthropic.Beta.Messages.BetaMessageParam[],
  65. ): Anthropic.Beta.Messages.BetaMessageParam[] {
  66. return messages.map(message => {
  67. if (!Array.isArray(message.content)) {
  68. return message
  69. }
  70. const normalizedContent = message.content.map(block => {
  71. // Strip 'caller' from tool_use blocks (assistant messages)
  72. if (block.type === 'tool_use') {
  73. // Destructure to exclude any extra fields like 'caller'
  74. const toolUse =
  75. block as Anthropic.Beta.Messages.BetaToolUseBlockParam & {
  76. caller?: unknown
  77. }
  78. return {
  79. type: 'tool_use' as const,
  80. id: toolUse.id,
  81. name: toolUse.name,
  82. input: toolUse.input,
  83. }
  84. }
  85. // Strip tool_reference blocks from tool_result content (user messages)
  86. if (block.type === 'tool_result') {
  87. const toolResult =
  88. block as Anthropic.Beta.Messages.BetaToolResultBlockParam
  89. if (Array.isArray(toolResult.content)) {
  90. const filteredContent = (toolResult.content as unknown[]).filter(
  91. c => !isToolReferenceBlock(c),
  92. ) as typeof toolResult.content
  93. if (filteredContent.length === 0) {
  94. return {
  95. ...toolResult,
  96. content: [{ type: 'text' as const, text: '[tool references]' }],
  97. }
  98. }
  99. if (filteredContent.length !== toolResult.content.length) {
  100. return {
  101. ...toolResult,
  102. content: filteredContent,
  103. }
  104. }
  105. }
  106. }
  107. return block
  108. })
  109. return {
  110. ...message,
  111. content: normalizedContent,
  112. }
  113. })
  114. }
  115. export async function countTokensWithAPI(
  116. content: string,
  117. ): Promise<number | null> {
  118. // Special case for empty content - API doesn't accept empty messages
  119. if (!content) {
  120. return 0
  121. }
  122. const message: Anthropic.Beta.Messages.BetaMessageParam = {
  123. role: 'user',
  124. content: content,
  125. }
  126. return countMessagesTokensWithAPI([message], [])
  127. }
  128. export async function countMessagesTokensWithAPI(
  129. messages: Anthropic.Beta.Messages.BetaMessageParam[],
  130. tools: Anthropic.Beta.Messages.BetaToolUnion[],
  131. ): Promise<number | null> {
  132. return withTokenCountVCR(messages, tools, async () => {
  133. try {
  134. const model = getMainLoopModel()
  135. const betas = getModelBetas(model)
  136. const containsThinking = hasThinkingBlocks(messages)
  137. if (getAPIProvider() === 'bedrock') {
  138. // @anthropic-sdk/bedrock-sdk doesn't support countTokens currently
  139. return countTokensWithBedrock({
  140. model: normalizeModelStringForAPI(model),
  141. messages,
  142. tools,
  143. betas,
  144. containsThinking,
  145. })
  146. }
  147. const anthropic = await getAnthropicClient({
  148. maxRetries: 1,
  149. model,
  150. source: 'count_tokens',
  151. })
  152. const filteredBetas =
  153. getAPIProvider() === 'vertex'
  154. ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
  155. : betas
  156. const response = await anthropic.beta.messages.countTokens({
  157. model: normalizeModelStringForAPI(model),
  158. messages:
  159. // When we pass tools and no messages, we need to pass a dummy message
  160. // to get an accurate tool token count.
  161. messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
  162. tools,
  163. ...(filteredBetas.length > 0 && { betas: filteredBetas }),
  164. // Enable thinking if messages contain thinking blocks
  165. ...(containsThinking && {
  166. thinking: {
  167. type: 'enabled',
  168. budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
  169. },
  170. }),
  171. })
  172. if (typeof response.input_tokens !== 'number') {
  173. // Vertex client throws
  174. // Bedrock client succeeds with { Output: { __type: 'com.amazon.coral.service#UnknownOperationException' }, Version: '1.0' }
  175. return null
  176. }
  177. return response.input_tokens
  178. } catch (error) {
  179. logError(error)
  180. return null
  181. }
  182. })
  183. }
  184. export function roughTokenCountEstimation(
  185. content: string,
  186. bytesPerToken: number = 4,
  187. ): number {
  188. return Math.round(content.length / bytesPerToken)
  189. }
  190. /**
  191. * Returns an estimated bytes-per-token ratio for a given file extension.
  192. * Dense JSON has many single-character tokens (`{`, `}`, `:`, `,`, `"`)
  193. * which makes the real ratio closer to 2 rather than the default 4.
  194. */
  195. export function bytesPerTokenForFileType(fileExtension: string): number {
  196. switch (fileExtension) {
  197. case 'json':
  198. case 'jsonl':
  199. case 'jsonc':
  200. return 2
  201. default:
  202. return 4
  203. }
  204. }
  205. /**
  206. * Like {@link roughTokenCountEstimation} but uses a more accurate
  207. * bytes-per-token ratio when the file type is known.
  208. *
  209. * This matters when the API-based token count is unavailable (e.g. on
  210. * Bedrock) and we fall back to the rough estimate — an underestimate can
  211. * let an oversized tool result slip into the conversation.
  212. */
  213. export function roughTokenCountEstimationForFileType(
  214. content: string,
  215. fileExtension: string,
  216. ): number {
  217. return roughTokenCountEstimation(
  218. content,
  219. bytesPerTokenForFileType(fileExtension),
  220. )
  221. }
/**
 * Estimates token count for a Message object by extracting and analyzing its text content.
 * This provides a more reliable estimate than getTokenUsage for messages that may have been compacted.
 * Works by issuing a minimal messages.create request (max_tokens of 1, or
 * TOKEN_COUNT_MAX_TOKENS when thinking is enabled) and summing the usage fields.
 *
 * Uses Haiku for token counting (Haiku 4.5 supports thinking blocks), except:
 * - Vertex global region: uses Sonnet (Haiku not available)
 * - Bedrock with thinking blocks: uses Sonnet (Haiku 3.5 doesn't support thinking)
 * - Vertex with thinking blocks: uses Sonnet (Haiku 3.5 doesn't support thinking)
 *
 * NOTE(review): unlike countMessagesTokensWithAPI there is no try/catch here,
 * so API errors propagate to the caller; the declared `number | null` return
 * only ever resolves to a number in this body.
 *
 * @param messages conversation to measure; a dummy message is sent when empty
 * @param tools tool definitions included in the count when non-empty
 * @returns total input-side tokens (input + cache creation + cache read)
 */
export async function countTokensViaHaikuFallback(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
  tools: Anthropic.Beta.Messages.BetaToolUnion[],
): Promise<number | null> {
  // Check if messages contain thinking blocks — this drives both model
  // selection and whether the request must enable thinking.
  const containsThinking = hasThinkingBlocks(messages)
  // If we're on Vertex and using global region, always use Sonnet since Haiku is not available there.
  const isVertexGlobalEndpoint =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) &&
    getVertexRegionForModel(getSmallFastModel()) === 'global'
  // If we're on Bedrock with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isBedrockWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK) && containsThinking
  // If we're on Vertex with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isVertexWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) && containsThinking
  // Otherwise always use Haiku - Haiku 4.5 supports thinking blocks.
  // WARNING: if you change this to use a non-Haiku model, this request will fail in 1P unless it uses getCLISyspromptPrefix.
  // Note: We don't need Sonnet for tool_reference blocks because we strip them via
  // stripToolSearchFieldsFromMessages() before sending.
  // Use getSmallFastModel() to respect ANTHROPIC_SMALL_FAST_MODEL env var for Bedrock users
  // with global inference profiles (see issue #10883).
  const model =
    isVertexGlobalEndpoint || isBedrockWithThinking || isVertexWithThinking
      ? getDefaultSonnetModel()
      : getSmallFastModel()
  const anthropic = await getAnthropicClient({
    maxRetries: 1,
    model,
    source: 'count_tokens',
  })
  // Strip tool search-specific fields (caller, tool_reference) before sending.
  // These fields are only valid with the tool search beta header.
  const normalizedMessages = stripToolSearchFieldsFromMessages(messages)
  // The API rejects an empty messages array, so substitute a dummy message.
  const messagesToSend: MessageParam[] =
    normalizedMessages.length > 0
      ? (normalizedMessages as MessageParam[])
      : [{ role: 'user', content: 'count' }]
  const betas = getModelBetas(model)
  // Filter betas for Vertex - some betas (like web-search) cause 400 errors
  // on certain Vertex endpoints. See issue #10789.
  const filteredBetas =
    getAPIProvider() === 'vertex'
      ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
      : betas
  // biome-ignore lint/plugin: token counting needs specialized parameters (thinking, betas) that sideQuery doesn't support
  const response = await anthropic.beta.messages.create({
    model: normalizeModelStringForAPI(model),
    // Keep output minimal: we only need usage numbers. With thinking enabled,
    // max_tokens must exceed the thinking budget (API constraint).
    max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
    messages: messagesToSend,
    tools: tools.length > 0 ? tools : undefined,
    ...(filteredBetas.length > 0 && { betas: filteredBetas }),
    metadata: getAPIMetadata(),
    ...getExtraBodyParams(),
    // Enable thinking if messages contain thinking blocks
    ...(containsThinking && {
      thinking: {
        type: 'enabled',
        budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
      },
    }),
  })
  const usage = response.usage
  const inputTokens = usage.input_tokens
  // Cache creation/read tokens still occupy context, so include them.
  const cacheCreationTokens = usage.cache_creation_input_tokens || 0
  const cacheReadTokens = usage.cache_read_input_tokens || 0
  return inputTokens + cacheCreationTokens + cacheReadTokens
}
  297. export function roughTokenCountEstimationForMessages(
  298. messages: readonly {
  299. type: string
  300. message?: { content?: unknown }
  301. attachment?: Attachment
  302. }[],
  303. ): number {
  304. let totalTokens = 0
  305. for (const message of messages) {
  306. totalTokens += roughTokenCountEstimationForMessage(message)
  307. }
  308. return totalTokens
  309. }
  310. export function roughTokenCountEstimationForMessage(message: {
  311. type: string
  312. message?: { content?: unknown }
  313. attachment?: Attachment
  314. }): number {
  315. if (
  316. (message.type === 'assistant' || message.type === 'user') &&
  317. message.message?.content
  318. ) {
  319. return roughTokenCountEstimationForContent(
  320. message.message?.content as
  321. | string
  322. | Array<Anthropic.ContentBlock>
  323. | Array<Anthropic.ContentBlockParam>
  324. | undefined,
  325. )
  326. }
  327. if (message.type === 'attachment' && message.attachment) {
  328. const userMessages = normalizeAttachmentForAPI(message.attachment)
  329. let total = 0
  330. for (const userMsg of userMessages) {
  331. total += roughTokenCountEstimationForContent(userMsg.message.content)
  332. }
  333. return total
  334. }
  335. return 0
  336. }
  337. function roughTokenCountEstimationForContent(
  338. content:
  339. | string
  340. | Array<Anthropic.ContentBlock>
  341. | Array<Anthropic.ContentBlockParam>
  342. | undefined,
  343. ): number {
  344. if (!content) {
  345. return 0
  346. }
  347. if (typeof content === 'string') {
  348. return roughTokenCountEstimation(content)
  349. }
  350. let totalTokens = 0
  351. for (const block of content) {
  352. totalTokens += roughTokenCountEstimationForBlock(block)
  353. }
  354. return totalTokens
  355. }
  356. function roughTokenCountEstimationForBlock(
  357. block: string | Anthropic.ContentBlock | Anthropic.ContentBlockParam,
  358. ): number {
  359. if (typeof block === 'string') {
  360. return roughTokenCountEstimation(block)
  361. }
  362. if (block.type === 'text') {
  363. return roughTokenCountEstimation(block.text)
  364. }
  365. if (block.type === 'image' || block.type === 'document') {
  366. // https://platform.claude.com/docs/en/build-with-claude/vision#calculate-image-costs
  367. // tokens = (width px * height px)/750
  368. // Images are resized to max 2000x2000 (5333 tokens). Use a conservative
  369. // estimate that matches microCompact's IMAGE_MAX_TOKEN_SIZE to avoid
  370. // underestimating and triggering auto-compact too late.
  371. //
  372. // document: base64 PDF in source.data. Must NOT reach the
  373. // jsonStringify catch-all — a 1MB PDF is ~1.33M base64 chars →
  374. // ~325k estimated tokens, vs the ~2000 the API actually charges.
  375. // Same constant as microCompact's calculateToolResultTokens.
  376. return 2000
  377. }
  378. if (block.type === 'tool_result') {
  379. return roughTokenCountEstimationForContent(block.content)
  380. }
  381. if (block.type === 'tool_use') {
  382. // input is the JSON the model generated — arbitrarily large (bash
  383. // commands, Edit diffs, file contents). Stringify once for the
  384. // char count; the API re-serializes anyway so this is what it sees.
  385. return roughTokenCountEstimation(
  386. block.name + jsonStringify(block.input ?? {}),
  387. )
  388. }
  389. if (block.type === 'thinking') {
  390. return roughTokenCountEstimation(block.thinking)
  391. }
  392. if (block.type === 'redacted_thinking') {
  393. return roughTokenCountEstimation(block.data)
  394. }
  395. // server_tool_use, web_search_tool_result, mcp_tool_use, etc. —
  396. // text-like payloads (tool inputs, search results, no base64).
  397. // Stringify-length tracks the serialized form the API sees; the
  398. // key/bracket overhead is single-digit percent on real blocks.
  399. return roughTokenCountEstimation(jsonStringify(block))
  400. }
/**
 * Counts tokens via the AWS Bedrock runtime CountTokens API, used because
 * the Anthropic Bedrock SDK does not currently expose countTokens.
 *
 * The AWS SDK command class is imported dynamically to defer loading the
 * ~279KB @aws-sdk/client-bedrock-runtime bundle until actually needed.
 *
 * @param model normalized model string (may be an inference profile / ARN)
 * @param messages conversation to count; a dummy message is sent when empty
 * @param tools tool definitions included in the count when non-empty
 * @param betas beta flags forwarded as anthropic_beta when non-empty
 * @param containsThinking whether thinking must be enabled in the request
 * @returns the token count, or null when the backing model ID cannot be
 *   resolved or the request fails (errors are logged, never rethrown)
 */
async function countTokensWithBedrock({
  model,
  messages,
  tools,
  betas,
  containsThinking,
}: {
  model: string
  messages: Anthropic.Beta.Messages.BetaMessageParam[]
  tools: Anthropic.Beta.Messages.BetaToolUnion[]
  betas: string[]
  containsThinking: boolean
}): Promise<number | null> {
  try {
    const client = await createBedrockRuntimeClient()
    // Bedrock CountTokens requires a model ID, not an inference profile / ARN,
    // so resolve profiles back to their backing foundation model.
    const modelId = isFoundationModel(model)
      ? model
      : await getInferenceProfileBackingModel(model)
    if (!modelId) {
      return null
    }
    // Body mirrors an InvokeModel request so Bedrock can price it accurately.
    const requestBody = {
      anthropic_version: 'bedrock-2023-05-31',
      // When we pass tools and no messages, we need to pass a dummy message
      // to get an accurate tool token count.
      messages:
        messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
      // max_tokens must exceed the thinking budget when thinking is enabled.
      max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
      ...(tools.length > 0 && { tools }),
      ...(betas.length > 0 && { anthropic_beta: betas }),
      ...(containsThinking && {
        thinking: {
          type: 'enabled',
          budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
        },
      }),
    }
    // Deferred import — see the note at the top of the file.
    const { CountTokensCommand } = await import(
      '@aws-sdk/client-bedrock-runtime'
    )
    const input: CountTokensCommandInput = {
      modelId,
      input: {
        invokeModel: {
          // CountTokens expects the raw request body as bytes.
          body: new TextEncoder().encode(jsonStringify(requestBody)),
        },
      },
    }
    const response = await client.send(new CountTokensCommand(input))
    const tokenCount = response.inputTokens ?? null
    return tokenCount
  } catch (error) {
    logError(error)
    return null
  }
}