| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- import { writeFile } from 'fs/promises'
- import { join } from 'path'
- import {
- type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
- logEvent,
- } from '../services/analytics/index.js'
- import type { MCPResultType } from '../services/mcp/client.js'
- import { toError } from './errors.js'
- import { formatFileSize } from './format.js'
- import { logError } from './log.js'
- import { ensureToolResultsDir, getToolResultsDir } from './toolResultStorage.js'
- /**
- * Generates a format description string based on the MCP result type and schema.
- */
- export function getFormatDescription(
- type: MCPResultType,
- schema?: unknown,
- ): string {
- switch (type) {
- case 'toolResult':
- return 'Plain text'
- case 'structuredContent':
- return schema ? `JSON with schema: ${schema}` : 'JSON'
- case 'contentArray':
- return schema ? `JSON array with schema: ${schema}` : 'JSON array'
- }
- }
- /**
- * Generates instruction text for Claude to read from a saved output file.
- *
- * @param rawOutputPath - Path to the saved output file
- * @param contentLength - Length of the content in characters
- * @param formatDescription - Description of the content format
- * @param maxReadLength - Optional max chars for Read tool (for Bash output context)
- * @returns Instruction text to include in the tool result
- */
- export function getLargeOutputInstructions(
- rawOutputPath: string,
- contentLength: number,
- formatDescription: string,
- maxReadLength?: number,
- ): string {
- const baseInstructions =
- `Error: result (${contentLength.toLocaleString()} characters) exceeds maximum allowed tokens. Output has been saved to ${rawOutputPath}.\n` +
- `Format: ${formatDescription}\n` +
- `Use offset and limit parameters to read specific portions of the file, search within it for specific content, and jq to make structured queries.\n` +
- `REQUIREMENTS FOR SUMMARIZATION/ANALYSIS/REVIEW:\n` +
- `- You MUST read the content from the file at ${rawOutputPath} in sequential chunks until 100% of the content has been read.\n`
- const truncationWarning = maxReadLength
- ? `- If you receive truncation warnings when reading the file ("[N lines truncated]"), reduce the chunk size until you have read 100% of the content without truncation ***DO NOT PROCEED UNTIL YOU HAVE DONE THIS***. Bash output is limited to ${maxReadLength.toLocaleString()} chars.\n`
- : `- If you receive truncation warnings when reading the file, reduce the chunk size until you have read 100% of the content without truncation.\n`
- const completionRequirement = `- Before producing ANY summary or analysis, you MUST explicitly describe what portion of the content you have read. ***If you did not read the entire content, you MUST explicitly state this.***\n`
- return baseInstructions + truncationWarning + completionRequirement
- }
- /**
- * Map a mime type to a file extension. Conservative: known types get their
- * proper extension; unknown types get 'bin'. The extension matters because
- * the Read tool dispatches on it (PDFs, images, etc. need the right ext).
- */
- export function extensionForMimeType(mimeType: string | undefined): string {
- if (!mimeType) return 'bin'
- // Strip any charset/boundary parameter
- const mt = (mimeType.split(';')[0] ?? '').trim().toLowerCase()
- switch (mt) {
- case 'application/pdf':
- return 'pdf'
- case 'application/json':
- return 'json'
- case 'text/csv':
- return 'csv'
- case 'text/plain':
- return 'txt'
- case 'text/html':
- return 'html'
- case 'text/markdown':
- return 'md'
- case 'application/zip':
- return 'zip'
- case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
- return 'docx'
- case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
- return 'xlsx'
- case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
- return 'pptx'
- case 'application/msword':
- return 'doc'
- case 'application/vnd.ms-excel':
- return 'xls'
- case 'audio/mpeg':
- return 'mp3'
- case 'audio/wav':
- return 'wav'
- case 'audio/ogg':
- return 'ogg'
- case 'video/mp4':
- return 'mp4'
- case 'video/webm':
- return 'webm'
- case 'image/png':
- return 'png'
- case 'image/jpeg':
- return 'jpg'
- case 'image/gif':
- return 'gif'
- case 'image/webp':
- return 'webp'
- case 'image/svg+xml':
- return 'svg'
- default:
- return 'bin'
- }
- }
- /**
- * Heuristic for whether a content-type header indicates binary content that
- * should be saved to disk rather than put into the model context.
- * Text-ish types (text/*, json, xml, form data) are treated as non-binary.
- */
- export function isBinaryContentType(contentType: string): boolean {
- if (!contentType) return false
- const mt = (contentType.split(';')[0] ?? '').trim().toLowerCase()
- if (mt.startsWith('text/')) return false
- // Structured text formats delivered with an application/ type. Use suffix
- // or exact match rather than substring so 'openxmlformats' (docx/xlsx) stays binary.
- if (mt.endsWith('+json') || mt === 'application/json') return false
- if (mt.endsWith('+xml') || mt === 'application/xml') return false
- if (mt.startsWith('application/javascript')) return false
- if (mt === 'application/x-www-form-urlencoded') return false
- return true
- }
- export type PersistBinaryResult =
- | { filepath: string; size: number; ext: string }
- | { error: string }
- /**
- * Write raw binary bytes to the tool-results directory with a mime-derived
- * extension. Unlike persistToolResult (which stringifies), this writes the
- * bytes as-is so the resulting file can be opened with native tools (Read
- * for PDFs, pandas for xlsx, etc.).
- */
- export async function persistBinaryContent(
- bytes: Buffer,
- mimeType: string | undefined,
- persistId: string,
- ): Promise<PersistBinaryResult> {
- await ensureToolResultsDir()
- const ext = extensionForMimeType(mimeType)
- const filepath = join(getToolResultsDir(), `${persistId}.${ext}`)
- try {
- await writeFile(filepath, bytes)
- } catch (error) {
- const err = toError(error)
- logError(err)
- return { error: err.message }
- }
- // mime type and extension are safe fixed-vocabulary strings (not paths/code)
- logEvent('tengu_binary_content_persisted', {
- mimeType: (mimeType ??
- 'unknown') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
- sizeBytes: bytes.length,
- ext: ext as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
- })
- return { filepath, size: bytes.length, ext }
- }
- /**
- * Build a short message telling Claude where binary content was saved.
- * Just states the path — no prescriptive hint, since what the model can
- * actually do with the file depends on provider/tooling.
- */
- export function getBinaryBlobSavedMessage(
- filepath: string,
- mimeType: string | undefined,
- size: number,
- sourceDescription: string,
- ): string {
- const mt = mimeType || 'unknown type'
- return `${sourceDescription}Binary content (${mt}, ${formatFileSize(size)}) saved to ${filepath}`
- }
|