index.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /**
  2. * Pure-TypeScript port of vendor/file-index-src (Rust NAPI module).
  3. *
  4. * The native module wraps nucleo (https://github.com/helix-editor/nucleo) for
  5. * high-performance fuzzy file searching. This port reimplements the same API
  6. * and scoring behavior without native dependencies.
  7. *
  8. * Key API:
  9. * new FileIndex()
  10. * .loadFromFileList(fileList: string[]): void — dedupe + index paths
  11. * .search(query: string, limit: number): SearchResult[]
  12. *
  13. * Score semantics: lower = better. Score is position-in-results / result-count,
  14. * so the best match is 0.0. Paths containing "test" get a 1.05× penalty (capped
  15. * at 1.0) so non-test files rank slightly higher.
  16. */
  17. export type SearchResult = {
  18. path: string
  19. score: number
  20. }
  21. // nucleo-style scoring constants (approximating fzf-v2 / nucleo bonuses)
  22. const SCORE_MATCH = 16
  23. const BONUS_BOUNDARY = 8
  24. const BONUS_CAMEL = 6
  25. const BONUS_CONSECUTIVE = 4
  26. const BONUS_FIRST_CHAR = 8
  27. const PENALTY_GAP_START = 3
  28. const PENALTY_GAP_EXTENSION = 1
  29. const TOP_LEVEL_CACHE_LIMIT = 100
  30. const MAX_QUERY_LEN = 64
  31. // Yield to event loop after this many ms of sync work. Chunk sizes are
  32. // time-based (not count-based) so slow machines get smaller chunks and
  33. // stay responsive — 5k paths is ~2ms on M-series but could be 15ms+ on
  34. // older Windows hardware.
  35. const CHUNK_MS = 4
  36. // Reusable buffer: records where each needle char matched during the indexOf scan
  37. const posBuf = new Int32Array(MAX_QUERY_LEN)
  38. export class FileIndex {
  39. private paths: string[] = []
  40. private lowerPaths: string[] = []
  41. private charBits: Int32Array = new Int32Array(0)
  42. private pathLens: Uint16Array = new Uint16Array(0)
  43. private topLevelCache: SearchResult[] | null = null
  44. // During async build, tracks how many paths have bitmap/lowerPath filled.
  45. // search() uses this to search the ready prefix while build continues.
  46. private readyCount = 0
  47. /**
  48. * Load paths from an array of strings.
  49. * This is the main way to populate the index — ripgrep collects files, we just search them.
  50. * Automatically deduplicates paths.
  51. */
  52. loadFromFileList(fileList: string[]): void {
  53. // Deduplicate and filter empty strings (matches Rust HashSet behavior)
  54. const seen = new Set<string>()
  55. const paths: string[] = []
  56. for (const line of fileList) {
  57. if (line.length > 0 && !seen.has(line)) {
  58. seen.add(line)
  59. paths.push(line)
  60. }
  61. }
  62. this.buildIndex(paths)
  63. }
  64. /**
  65. * Async variant: yields to the event loop every ~8–12k paths so large
  66. * indexes (270k+ files) don't block the main thread for >10ms at a time.
  67. * Identical result to loadFromFileList.
  68. *
  69. * Returns { queryable, done }:
  70. * - queryable: resolves as soon as the first chunk is indexed (search
  71. * returns partial results). For a 270k-path list this is ~5–10ms of
  72. * sync work after the paths array is available.
  73. * - done: resolves when the entire index is built.
  74. */
  75. loadFromFileListAsync(fileList: string[]): {
  76. queryable: Promise<void>
  77. done: Promise<void>
  78. } {
  79. let markQueryable: () => void = () => {}
  80. const queryable = new Promise<void>(resolve => {
  81. markQueryable = resolve
  82. })
  83. const done = this.buildAsync(fileList, markQueryable)
  84. return { queryable, done }
  85. }
  86. private async buildAsync(
  87. fileList: string[],
  88. markQueryable: () => void,
  89. ): Promise<void> {
  90. const seen = new Set<string>()
  91. const paths: string[] = []
  92. let chunkStart = performance.now()
  93. for (let i = 0; i < fileList.length; i++) {
  94. const line = fileList[i]!
  95. if (line.length > 0 && !seen.has(line)) {
  96. seen.add(line)
  97. paths.push(line)
  98. }
  99. // Check every 256 iterations to amortize performance.now() overhead
  100. if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
  101. await yieldToEventLoop()
  102. chunkStart = performance.now()
  103. }
  104. }
  105. this.resetArrays(paths)
  106. chunkStart = performance.now()
  107. let firstChunk = true
  108. for (let i = 0; i < paths.length; i++) {
  109. this.indexPath(i)
  110. if ((i & 0xff) === 0xff && performance.now() - chunkStart > CHUNK_MS) {
  111. this.readyCount = i + 1
  112. if (firstChunk) {
  113. markQueryable()
  114. firstChunk = false
  115. }
  116. await yieldToEventLoop()
  117. chunkStart = performance.now()
  118. }
  119. }
  120. this.readyCount = paths.length
  121. markQueryable()
  122. }
  123. private buildIndex(paths: string[]): void {
  124. this.resetArrays(paths)
  125. for (let i = 0; i < paths.length; i++) {
  126. this.indexPath(i)
  127. }
  128. this.readyCount = paths.length
  129. }
  130. private resetArrays(paths: string[]): void {
  131. const n = paths.length
  132. this.paths = paths
  133. this.lowerPaths = new Array(n)
  134. this.charBits = new Int32Array(n)
  135. this.pathLens = new Uint16Array(n)
  136. this.readyCount = 0
  137. this.topLevelCache = computeTopLevelEntries(paths, TOP_LEVEL_CACHE_LIMIT)
  138. }
  139. // Precompute: lowercase, a–z bitmap, length. Bitmap gives O(1) rejection
  140. // of paths missing any needle letter (89% survival for broad queries like
  141. // "test" → still a 10%+ free win; 90%+ rejection for rare chars).
  142. private indexPath(i: number): void {
  143. const lp = this.paths[i]!.toLowerCase()
  144. this.lowerPaths[i] = lp
  145. const len = lp.length
  146. this.pathLens[i] = len
  147. let bits = 0
  148. for (let j = 0; j < len; j++) {
  149. const c = lp.charCodeAt(j)
  150. if (c >= 97 && c <= 122) bits |= 1 << (c - 97)
  151. }
  152. this.charBits[i] = bits
  153. }
  154. /**
  155. * Search for files matching the query using fuzzy matching.
  156. * Returns top N results sorted by match score.
  157. */
  158. search(query: string, limit: number): SearchResult[] {
  159. if (limit <= 0) return []
  160. if (query.length === 0) {
  161. if (this.topLevelCache) {
  162. return this.topLevelCache.slice(0, limit)
  163. }
  164. return []
  165. }
  166. // Smart case: lowercase query → case-insensitive; any uppercase → case-sensitive
  167. const caseSensitive = query !== query.toLowerCase()
  168. const needle = caseSensitive ? query : query.toLowerCase()
  169. const nLen = Math.min(needle.length, MAX_QUERY_LEN)
  170. const needleChars: string[] = new Array(nLen)
  171. let needleBitmap = 0
  172. for (let j = 0; j < nLen; j++) {
  173. const ch = needle.charAt(j)
  174. needleChars[j] = ch
  175. const cc = ch.charCodeAt(0)
  176. if (cc >= 97 && cc <= 122) needleBitmap |= 1 << (cc - 97)
  177. }
  178. // Upper bound on score assuming every match gets the max boundary bonus.
  179. // Used to reject paths whose gap penalties alone make them unable to beat
  180. // the current top-k threshold, before the charCodeAt-heavy boundary pass.
  181. const scoreCeiling =
  182. nLen * (SCORE_MATCH + BONUS_BOUNDARY) + BONUS_FIRST_CHAR + 32
  183. // Top-k: maintain a sorted-ascending array of the best `limit` matches.
  184. // Avoids O(n log n) sort of all matches when we only need `limit` of them.
  185. const topK: { path: string; fuzzScore: number }[] = []
  186. let threshold = -Infinity
  187. const { paths, lowerPaths, charBits, pathLens, readyCount } = this
  188. outer: for (let i = 0; i < readyCount; i++) {
  189. // O(1) bitmap reject: path must contain every letter in the needle
  190. if ((charBits[i]! & needleBitmap) !== needleBitmap) continue
  191. const haystack = caseSensitive ? paths[i]! : lowerPaths[i]!
  192. // Greedy-leftmost indexOf gives fast but suboptimal positions when the
  193. // first needle char appears early (e.g. 's' in "src/") while the real
  194. // match lives deeper (e.g. "settings/"). We score from multiple start
  195. // positions — the leftmost hit plus every word-boundary occurrence of
  196. // needle[0] — and keep the best. Typical paths have 2–4 boundary starts,
  197. // so the overhead is minimal.
  198. // Collect candidate start positions for needle[0]
  199. const firstChar = needleChars[0]!
  200. let startCount = 0
  201. // startPositions is stack-allocated (reused array would add complexity
  202. // for marginal gain; paths rarely have >8 boundary starts)
  203. const startPositions: number[] = []
  204. // Always try the leftmost occurrence
  205. const firstPos = haystack.indexOf(firstChar)
  206. if (firstPos === -1) continue
  207. startPositions[startCount++] = firstPos
  208. // Also try every word-boundary position where needle[0] occurs
  209. for (let bp = firstPos + 1; bp < haystack.length; bp++) {
  210. if (haystack.charCodeAt(bp) !== firstChar.charCodeAt(0)) continue
  211. // Check if this position is at a word boundary
  212. const prevCode = haystack.charCodeAt(bp - 1)
  213. if (
  214. prevCode === 47 || // /
  215. prevCode === 92 || // \
  216. prevCode === 45 || // -
  217. prevCode === 95 || // _
  218. prevCode === 46 || // .
  219. prevCode === 32 // space
  220. ) {
  221. startPositions[startCount++] = bp
  222. }
  223. }
  224. const originalPath = paths[i]!
  225. const hLen = pathLens[i]!
  226. const lengthBonus = Math.max(0, 32 - (hLen >> 2))
  227. let bestScore = -Infinity
  228. for (let si = 0; si < startCount; si++) {
  229. posBuf[0] = startPositions[si]!
  230. let gapPenalty = 0
  231. let consecBonus = 0
  232. let prev = posBuf[0]!
  233. let matched = true
  234. for (let j = 1; j < nLen; j++) {
  235. const pos = haystack.indexOf(needleChars[j]!, prev + 1)
  236. if (pos === -1) { matched = false; break }
  237. posBuf[j] = pos
  238. const gap = pos - prev - 1
  239. if (gap === 0) consecBonus += BONUS_CONSECUTIVE
  240. else gapPenalty += PENALTY_GAP_START + gap * PENALTY_GAP_EXTENSION
  241. prev = pos
  242. }
  243. if (!matched) continue
  244. // Gap-bound reject for this start position
  245. if (
  246. topK.length === limit &&
  247. scoreCeiling + consecBonus - gapPenalty + lengthBonus <= threshold
  248. ) {
  249. continue
  250. }
  251. // Boundary/camelCase scoring
  252. let score = nLen * SCORE_MATCH + consecBonus - gapPenalty
  253. score += scoreBonusAt(originalPath, posBuf[0]!, true)
  254. for (let j = 1; j < nLen; j++) {
  255. score += scoreBonusAt(originalPath, posBuf[j]!, false)
  256. }
  257. score += lengthBonus
  258. if (score > bestScore) bestScore = score
  259. }
  260. if (bestScore === -Infinity) continue
  261. const score = bestScore
  262. if (topK.length < limit) {
  263. topK.push({ path: originalPath, fuzzScore: score })
  264. if (topK.length === limit) {
  265. topK.sort((a, b) => a.fuzzScore - b.fuzzScore)
  266. threshold = topK[0]!.fuzzScore
  267. }
  268. } else if (score > threshold) {
  269. let lo = 0
  270. let hi = topK.length
  271. while (lo < hi) {
  272. const mid = (lo + hi) >> 1
  273. if (topK[mid]!.fuzzScore < score) lo = mid + 1
  274. else hi = mid
  275. }
  276. topK.splice(lo, 0, { path: originalPath, fuzzScore: score })
  277. topK.shift()
  278. threshold = topK[0]!.fuzzScore
  279. }
  280. }
  281. // topK is ascending; reverse to descending (best first)
  282. topK.sort((a, b) => b.fuzzScore - a.fuzzScore)
  283. const matchCount = topK.length
  284. const denom = Math.max(matchCount, 1)
  285. const results: SearchResult[] = new Array(matchCount)
  286. for (let i = 0; i < matchCount; i++) {
  287. const path = topK[i]!.path
  288. const positionScore = i / denom
  289. const finalScore = path.includes('test')
  290. ? Math.min(positionScore * 1.05, 1.0)
  291. : positionScore
  292. results[i] = { path, score: finalScore }
  293. }
  294. return results
  295. }
  296. }
  297. /**
  298. * Boundary/camelCase bonus for a match at position `pos` in the original-case
  299. * path. `first` enables the start-of-string bonus (only for needle[0]).
  300. */
  301. function scoreBonusAt(path: string, pos: number, first: boolean): number {
  302. if (pos === 0) return first ? BONUS_FIRST_CHAR : 0
  303. const prevCh = path.charCodeAt(pos - 1)
  304. if (isBoundary(prevCh)) return BONUS_BOUNDARY
  305. if (isLower(prevCh) && isUpper(path.charCodeAt(pos))) return BONUS_CAMEL
  306. return 0
  307. }
  308. function isBoundary(code: number): boolean {
  309. // / \ - _ . space
  310. return (
  311. code === 47 || // /
  312. code === 92 || // \
  313. code === 45 || // -
  314. code === 95 || // _
  315. code === 46 || // .
  316. code === 32 // space
  317. )
  318. }
  319. function isLower(code: number): boolean {
  320. return code >= 97 && code <= 122
  321. }
  322. function isUpper(code: number): boolean {
  323. return code >= 65 && code <= 90
  324. }
  325. export function yieldToEventLoop(): Promise<void> {
  326. return new Promise(resolve => setImmediate(resolve))
  327. }
  328. export { CHUNK_MS }
  329. /**
  330. * Extract unique top-level path segments, sorted by (length asc, then alpha asc).
  331. * Handles both Unix (/) and Windows (\) path separators.
  332. * Mirrors FileIndex::compute_top_level_entries in lib.rs.
  333. */
  334. function computeTopLevelEntries(
  335. paths: string[],
  336. limit: number,
  337. ): SearchResult[] {
  338. const topLevel = new Set<string>()
  339. for (const p of paths) {
  340. // Split on first / or \ separator
  341. let end = p.length
  342. for (let i = 0; i < p.length; i++) {
  343. const c = p.charCodeAt(i)
  344. if (c === 47 || c === 92) {
  345. end = i
  346. break
  347. }
  348. }
  349. const segment = p.slice(0, end)
  350. if (segment.length > 0) {
  351. topLevel.add(segment)
  352. if (topLevel.size >= limit) break
  353. }
  354. }
  355. const sorted = Array.from(topLevel)
  356. sorted.sort((a, b) => {
  357. const lenDiff = a.length - b.length
  358. if (lenDiff !== 0) return lenDiff
  359. return a < b ? -1 : a > b ? 1 : 0
  360. })
  361. return sorted.slice(0, limit).map(path => ({ path, score: 0.0 }))
  362. }
  363. export default FileIndex
  364. export type { FileIndex as FileIndexType }