streetsidesoftware · Jul 16, 2024
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
+4-4 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
+4-4
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
+25-61 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
+25-61
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
+1-1 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
+1-1
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
-33 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
-33
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts
+5-2 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts
+5-2
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
+2-2 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
+2-2
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts
+164-1 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts
+164-1
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts
+134 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts
+134
diff --git a/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts
+89-14 b/‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts
+89-14
diff --git a/‎packages/cspell-trie-lib/src/perf/charIndex.perf.ts
+5-4 b/‎packages/cspell-trie-lib/src/perf/charIndex.perf.ts
+5-4
diff --git a/‎vitest.config.mjs
+2 b/‎vitest.config.mjs
+2
@@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
         const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
         const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
         expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
-        const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
-        expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
-        expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
+        const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
+        expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
+        expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.
 
         // Add the same letters again.
         expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));
 
         const charIndex = charIndexBuilder.build();
-        expect(charIndex.size).toBe(11);
+        expect(charIndex.size).toBe(16);
         expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
     });
 });
@@ -1,12 +1,12 @@
-import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
+import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
 
 export type Utf8Seq = Readonly<number[]>;
 
 export type CharIndexMap = Record<string, Utf8BE32>;
 
 export type RO_CharIndexMap = Readonly<CharIndexMap>;
 
-export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
+export type CharIndexSeqMap = Record<string, Utf8Seq>;
 
 export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;
 
@@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
 Object.freeze(emptySeq);
 
 export class CharIndex {
-    readonly charToUtf8Map: RO_CharIndexMap;
-    readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
+    #charToUtf8SeqMap: CharIndexSeqMap;
 
     #lastWord = '';
     #lastWordSeq: Utf8Seq = [];
+    #multiByteChars: boolean;
 
     constructor(readonly charIndex: readonly string[]) {
-        this.charToUtf8Map = buildCharIndexMap(charIndex);
-        this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
-    }
-
-    getUtf8Value(c: string): number {
-        return this.charToUtf8Map[c] || 0;
+        this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
+        this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
     }
 
     getCharUtf8Seq(c: string): Utf8Seq {
-        const r = this.charToUtf8SeqMap[c] ?? emptySeq;
-        return typeof r === 'number' ? [r] : r;
-    }
-
-    __wordToUtf8Seq(word: string): Utf8Seq {
-        // Note: Array.flatMap is very slow
-        const seq: number[] = new Array(word.length);
-        let i = 0;
-        for (const c of word) {
-            const cSep = this.charToUtf8SeqMap[c];
-            if (typeof cSep === 'number') {
-                seq[i++] = cSep;
-                continue;
-            }
-            if (!cSep) {
-                seq[i++] = 0;
-                continue;
-            }
-            for (const cIdx of cSep) {
-                seq[i++] = cIdx;
-            }
-        }
-        if (seq.length !== i) seq.length = i;
-        return seq;
+        const found = this.#charToUtf8SeqMap[c];
+        if (found) return found;
+        const s = encodeTextToUtf8(c);
+        this.#charToUtf8SeqMap[c] = s;
+        return s;
     }
 
     wordToUtf8Seq(word: string): Utf8Seq {
         if (this.#lastWord === word) return this.#lastWordSeq;
 
-        const seq = this.__wordToUtf8Seq(word);
+        const seq = encodeTextToUtf8(word);
 
         this.#lastWord = word;
         this.#lastWordSeq = seq;
@@ -69,7 +46,7 @@ export class CharIndex {
     }
 
     indexContainsMultiByteChars(): boolean {
-        return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
+        return this.#multiByteChars;
     }
 
     get size(): number {
@@ -81,22 +58,10 @@ export class CharIndex {
     }
 }
 
-function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
-    const map: CharIndexMap = Object.create(null);
-    for (const c of charIndex) {
-        const cn = c.normalize('NFC');
-        const utf8 = encodeUtf8N_BE(cn.codePointAt(0) || 0);
-        map[c] = utf8;
-        map[c.normalize('NFC')] = utf8;
-        map[c.normalize('NFD')] = utf8;
-    }
-    return map;
-}
-
-function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
+function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
     const map: CharIndexSeqMap = Object.create(null);
-    for (const [key, value] of Object.entries(charIndexMap)) {
-        map[key] = splitUtf8IfNeeded(value);
+    for (const key of charIndex) {
+        map[key] = encodeTextToUtf8(key);
     }
     return map;
 }
@@ -106,7 +71,7 @@ export class CharIndexBuilder {
     readonly charIndexMap: CharIndexMap = Object.create(null);
     readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);
 
-    readonly #mapIdxToSeq = new Map<number, number[] | number>();
+    readonly #mapIdxToSeq = new Map<number, number[]>();
 
     constructor() {
         this.getUtf8Value('');
@@ -126,24 +91,22 @@ export class CharIndexBuilder {
         return utf8;
     }
 
-    utf8ValueToUtf8Seq(idx: number): number[] | number {
+    utf8ValueToUtf8Seq(idx: number): number[] {
         const found = this.#mapIdxToSeq.get(idx);
         if (found !== undefined) {
             return found;
         }
-        const seq = splitUtf8IfNeeded(idx);
+        const seq = splitUtf8(idx);
         this.#mapIdxToSeq.set(idx, seq);
         return seq;
     }
 
     charToUtf8Seq(c: string): number[] {
         const idx = this.getUtf8Value(c);
-        const s = this.utf8ValueToUtf8Seq(idx);
-        return typeof s === 'number' ? [s] : s;
+        return this.utf8ValueToUtf8Seq(idx);
     }
 
     wordToUtf8Seq(word: string): number[] {
-        // word = word.normalize('NFC');
         const seq: number[] = new Array(word.length);
         let i = 0;
         for (const c of word) {
@@ -170,8 +133,9 @@ export class CharIndexBuilder {
     }
 }
 
-function splitUtf8IfNeeded(utf8: number): number | number[] {
-    if (utf8 < 0x80) return utf8;
-    const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
-    return s.length ? s : s[0];
+/**
+ * Split a UTF-8 sequence that has been packed into a single number into its bytes.
+ *
+ * The value is first reinterpreted as an unsigned 32-bit integer. Without that, a packed
+ * 4-byte sequence that arrives as a negative int32 (top bit set) would satisfy `<= 0xff`
+ * and be returned as one negative "byte".
+ *
+ * @param utf8 - the packed value, most significant byte first.
+ * @returns the bytes in order, most significant first. Values up to 0xff yield one byte.
+ */
+function splitUtf8(utf8: number): number[] {
+    const u = utf8 >>> 0;
+    if (u <= 0xff) return [u];
+    if (u <= 0xffff) return [(u >> 8) & 0xff, u & 0xff];
+    if (u <= 0xff_ffff) return [(u >> 16) & 0xff, (u >> 8) & 0xff, u & 0xff];
+    // Zero bytes are dropped in the 4-byte case, as before; a well-formed sequence has none.
+    return [(u >> 24) & 0xff, (u >> 16) & 0xff, (u >> 8) & 0xff, u & 0xff].filter((v) => v);
+}
@@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {
 
     static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
         return new FastTrieBlobIRoot(
-            new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
+            new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
             0,
             trie.info,
         );
 
@@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
             for (let i = 0; i < utf8Seq.length; ++i) {
                 insertCharIndexes(utf8Seq[i], pDepth);
             }
-            // dumpState({ step: 'insertChar', char });
         };
 
         /**
@@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
             const pos = s.pos;
             const node = nodes[nodeIdx];
             node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);
-
-            // dumpState({ step: 'reference', refId, refNodeIdx });
         };
 
         const backStep = (num: number) => {
@@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
                 depth = stack[depth].pDepth;
             }
             nodeIdx = stack[depth + 1].nodeIdx;
-
-            // dumpState({ step: 'backStep', num });
         };
 
-        // function dumpNode(node: number[]): string {
-        //     const n = node
-        //         .map((n, i) => {
-        //             if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
-        //             return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
-        //         })
-        //         .join(', ');
-        //     return `[${n}]`;
-        // }
-
-        // function dumpNodes(nodes: FastTrieBlobNode[]) {
-        //     return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
-        // }
-
-        // const debug = false;
-
-        // function dumpState(extra?: Record<string, unknown>) {
-        //     debug &&
-        //         console.warn('%o', {
-        //             stack: stack.slice(0, depth + 1),
-        //             nodes: dumpNodes(nodes),
-        //             nodeIdx,
-        //             depth,
-        //             refNodes,
-        //             ...extra,
-        //         });
-        // }
-
         const c: BuilderCursor = {
             insertChar,
             markEOW,
 
@@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
         readonly nodes: number[][],
         readonly charIndex: CharIndex,
         maskInfo: FastTrieBlobBitMaskInfo,
-        sorted = false,
     ) {
         const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
         this.NodeMaskEOW = NodeMaskEOW;
         this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
         this.NodeChildRefShift = NodeChildRefShift;
         this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
-        !sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
+        sortNodes(nodes, this.NodeMaskChildCharIndex);
     }
 }
 
@@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
  * @returns
  */
 export function sortNodes(nodes: number[][], mask: number): number[][] {
+    if (Object.isFrozen(nodes)) {
+        assertSorted(nodes, mask);
+        return nodes;
+    }
     for (let i = 0; i < nodes.length; ++i) {
         let node = nodes[i];
         if (node.length > 2) {
 
@@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
         this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
     }
 
-    public wordToNodeCharIndexSequence(word: string): Utf8Seq {
+    public wordToUtf8Seq(word: string): Utf8Seq {
         return this.charIndex.wordToUtf8Seq(word);
     }
 
@@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
         const NodeChildRefShift = TrieBlob.NodeChildRefShift;
         const nodes = this.nodes;
         const nodes8 = this.#nodes8;
-        const wordIndexes = this.wordToNodeCharIndexSequence(word);
+        const wordIndexes = this.wordToUtf8Seq(word);
         const lookup = this.#nodeIdxLookup;
         const len = wordIndexes.length;
         let p = 0;
 
@@ -1,11 +1,24 @@
 import { suite } from 'perf-insight';
 
-import { decodeUtf8ByteStream, decodeUtf8N_BE, decodeUtf8N_LE, encodeUtf8N_BE, encodeUtf8N_LE } from './Utf8.js';
+import {
+    decodeUtf8ByteStream,
+    decodeUtf8N_BE,
+    decodeUtf8N_LE,
+    encodeCodePointsToUtf8Into,
+    encodeTextToUtf8,
+    encodeTextToUtf8Into,
+    encodeUtf8N_BE,
+    encodeUtf8N_LE,
+    textToCodePoints,
+} from './Utf8.js';
 
 suite('Utf8 encode/decode', async (test) => {
     const iterations = 1000;
     const text = sampleText();
+    const words = text.split(/\s+/).filter((a) => !!a);
+    const wordsCP = words.map((word) => [...word].map((char) => char.codePointAt(0) || 0));
     const chars = [...text];
+    const codePoints = chars.map((char) => char.codePointAt(0) || 0);
     const encoder = new TextEncoder();
     const decoder = new TextDecoder();
     const scratchBuffer = new Uint8Array(1024);
@@ -29,6 +42,16 @@ suite('Utf8 encode/decode', async (test) => {
         }
     });
 
+    test('TextEncoder.encodeInto by char', () => {
+        const buffer = new Uint8Array(scratchBuffer.buffer, 0, 4);
+        for (let i = iterations; i > 0; --i) {
+            for (const char of chars) {
+                buffer[0] = 0;
+                encoder.encodeInto(char, buffer);
+            }
+        }
+    });
+
     test('encodeUtf8N_BE', () => {
         for (let i = iterations; i > 0; --i) {
             for (const char of chars) {
@@ -50,6 +73,146 @@ suite('Utf8 encode/decode', async (test) => {
             }
         }
     });
+
+    test('TextEncoder.encodeInto text', () => {
+        const buffer = scratchBuffer;
+        const _text = text;
+        for (let i = iterations; i > 0; --i) {
+            encoder.encodeInto(_text, buffer);
+        }
+    });
+
+    test('Buffer.write text', () => {
+        const buffer = Buffer.from(scratchBuffer.buffer);
+        // const _text = text;
+        for (let i = iterations; i > 0; --i) {
+            buffer.write(text, 'utf16le');
+        }
+    });
+
+    test('encodeCodePointsInto', () => {
+        const buffer = scratchBuffer;
+        const points = codePoints;
+        for (let i = iterations; i > 0; --i) {
+            encodeCodePointsToUtf8Into(points, buffer);
+        }
+    });
+
+    test(`TextEncoder.encodeInto words (${words.length})`, () => {
+        const buffer = scratchBuffer;
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                encoder.encodeInto(word, buffer);
+            }
+        }
+    });
+
+    test(`encodeCodePointsInto wordsCP (${words.length})`, () => {
+        const buffer = scratchBuffer;
+        const words = wordsCP;
+        for (let i = iterations; i > 0; --i) {
+            for (const points of words) {
+                encodeCodePointsToUtf8Into(points, buffer);
+            }
+        }
+    });
+
+    test(`encodeCodePointsInto Array wordsCP (${words.length})`, () => {
+        const buffer = new Array(100);
+        const words = wordsCP;
+        for (let i = iterations; i > 0; --i) {
+            for (const points of words) {
+                encodeCodePointsToUtf8Into(points, buffer);
+            }
+        }
+    });
+
+    test(`encodeCodePointsInto wordsCP .codePointAt (${words.length})`, () => {
+        const buffer = scratchBuffer;
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                encodeCodePointsToUtf8Into(
+                    [...word].map((a) => a.codePointAt(0) || 0),
+                    buffer,
+                );
+            }
+        }
+    });
+
+    test(`encodeTextToUtf8Into Uint8Array words (${words.length})`, () => {
+        const buffer = scratchBuffer;
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                encodeTextToUtf8Into(word, buffer);
+            }
+        }
+    });
+
+    test(`encodeTextToUtf8Into array words (${words.length})`, () => {
+        const buffer = new Array(100);
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                encodeTextToUtf8Into(word, buffer);
+            }
+        }
+    });
+
+    test(`encoder.encode(word) to array words (${words.length})`, () => {
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                [...encoder.encode(word)];
+            }
+        }
+    });
+
+    test(`encodeTextToUtf8 array words (${words.length})`, () => {
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                encodeTextToUtf8(word);
+            }
+        }
+    });
+
+    const charToUtf8Map = new Map<string, number[]>(
+        [...new Set([...sampleText()])].map((char) => [char, encodeTextToUtf8(char)] as const),
+    );
+
+    test(`encodeTextToUtf8 to array with lookup (${words.length})`, () => {
+        const _words = words;
+        for (let i = iterations; i > 0; --i) {
+            for (const word of _words) {
+                const a: number[] = new Array(word.length * 2);
+                let i = 0;
+                for (const c of word) {
+                    const u8 = charToUtf8Map.get(c);
+                    for (const u of u8 || []) {
+                        a[i++] = u;
+                    }
+                }
+                a.length = i;
+            }
+        }
+    });
+
+    test('textToCodePoints', () => {
+        const _text = text;
+        for (let i = iterations; i > 0; --i) {
+            textToCodePoints(_text);
+        }
+    });
+
+    test('textToCodePoints map', () => {
+        const _text = text;
+        for (let i = iterations; i > 0; --i) {
+            [..._text].map((a) => a.codePointAt(0) || 0);
+        }
+    });
 });
 
 suite('Utf8 decode buffer', async (test) => {
 
@@ -6,9 +6,12 @@ import {
     decodeUtf8ByteStream,
     decodeUtf8N_BE,
     decodeUtf8N_LE,
+    encodeCodePointsToUtf8Into,
+    encodeTextToUtf8,
     encodeUtf8N_BE,
     encodeUtf8N_LE,
     hex32,
+    textToCodePoints,
     Utf8Accumulator,
 } from './Utf8.js';
 
@@ -43,6 +46,77 @@ describe('Utf8 lib', () => {
         }
     });
 
+    test.each`
+        text    | expected
+        ${'a'}  | ${[0x61]}
+        ${'ab'} | ${[0x61, 0x62]}
+        ${'é'}  | ${[195, 169]}
+        ${'🇺🇸'} | ${[240, 159, 135, 186, 240, 159, 135, 184]}
+    `('encodeTextToUtf8 $text', ({ text, expected }) => {
+        expect(encodeTextToUtf8(text)).toEqual(expected);
+        expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]);
+
+        const scratch: number[] = [];
+        const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch);
+        expect(scratch.slice(0, len)).toEqual(expected);
+    });
+
+    test('encodeCodePointsToUtf8Into', () => {
+        const decoder = new TextDecoder();
+        const text = sampleText();
+        const scratch: number[] = [];
+        const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch);
+        const buf = new Uint8Array(scratch.slice(0, len));
+        expect(decoder.decode(buf)).toBe(text);
+    });
+
+    test.each`
+        text    | expected
+        ${'a'}  | ${[0x61]}
+        ${'ab'} | ${[0x61, 0x62]}
+        ${'é'}  | ${[0xc3a9]}
+        ${'🇺🇸'} | ${[0xf09f_87ba, 0xf09f_87b8]}
+    `('encodeUtf8N_BE $text', ({ text, expected }) => {
+        const utf = textToCodePoints(text).map((cp) => encodeUtf8N_BE(cp));
+        expect(utf).toEqual(expected);
+        expect(
+            String.fromCodePoint(
+                ...utf
+                    .map((v) => v ^ ~1) // force it to be native
+                    .map((v) => v ^ ~1)
+                    .map((c) => decodeUtf8N_BE(c)),
+            ),
+        ).toEqual(text);
+    });
+
+    test('decodeUtf8N_BE invalid', () => {
+        expect(decodeUtf8N_BE(0xff)).toBe(0xfffd);
+    });
+
+    test('decodeUtf8N_LE invalid', () => {
+        expect(decodeUtf8N_LE(0xff)).toBe(0xfffd);
+    });
+
+    test.each`
+        text    | expected
+        ${'a'}  | ${[0x61]}
+        ${'ab'} | ${[0x61, 0x62]}
+        ${'é'}  | ${[0xa9c3]}
+        ${'ë'}  | ${[0xabc3]}
+        ${'🇺🇸'} | ${[0xba87_9ff0, 0xb887_9ff0]}
+    `('encodeUtf8N_LE $text', ({ text, expected }) => {
+        const utf = textToCodePoints(text).map((cp) => encodeUtf8N_LE(cp));
+        expect(utf).toEqual(expected);
+        expect(
+            String.fromCodePoint(
+                ...utf
+                    .map((v) => v ^ ~1) // force it to be native
+                    .map((v) => v ^ ~1)
+                    .map((c) => decodeUtf8N_LE(c)),
+            ),
+        ).toEqual(text);
+    });
+
     test.each`
         value          | expected
         ${0xff}        | ${'0x0000_00ff'}
@@ -78,6 +152,66 @@ describe('Utf8Accumulator', () => {
         const data = encoder.encode(text);
 
         expect([...decodeUtf8ByteStream(data)]).toEqual([...text].map((c) => c.codePointAt(0)));
+
+        function* gen() {
+            yield* data;
+        }
+        expect([...decodeUtf8ByteStream(gen())]).toEqual([...text].map((c) => c.codePointAt(0)));
+    });
+
+    test('encodeTextToUtf8', () => {
+        const text = sampleText();
+        expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]);
+    });
+
+    test('decodeUtf8ByteStream', () => {
+        const text = sampleText();
+        expect(String.fromCodePoint(...decodeUtf8ByteStream(encoder.encode(text)))).toBe(text);
+    });
+
+    test('Utf8Accumulator isMultiByte', () => {
+        expect(Utf8Accumulator.isMultiByte(0x7f)).toBe(false);
+        expect(Utf8Accumulator.isMultiByte(0xf0)).toBe(true);
+        expect(Utf8Accumulator.isSingleByte(0x7f)).toBe(true);
+        expect(Utf8Accumulator.isSingleByte(0xf0)).toBe(false);
+    });
+
+    test('Utf8Accumulator', () => {
+        const acc = Utf8Accumulator.create();
+
+        expect(acc.decode(0x61)).toBe(0x61);
+        expect(acc.decode(0x61)).toBe(0x61);
+
+        // é
+        expect(acc.decode(0xc3)).toBe(undefined);
+        const cloneAcc = acc.clone();
+        expect(acc.decode(0xa9)).toBe('é'.codePointAt(0));
+        expect(acc.decode(0x61)).toBe(0x61);
+        // ë
+        expect(cloneAcc.decode(0xab)).toBe('ë'.codePointAt(0));
+
+        // out of order
+        expect(acc.decode(0xa9)).toBe(0xfffd);
+        expect(acc.decode(0xc3)).toBe(undefined);
+        acc.reset();
+
+        // two leads in a row
+        expect(acc.decode(0xc3)).toBe(undefined);
+        expect(acc.decode(0xc3)).toBe(0xfffd);
+        expect(acc.decode(0xa9)).toBe(0xfffd);
+
+        // reset after a lead byte discards the partial sequence
+        expect(acc.decode(0xc3)).toBe(undefined);
+        acc.reset();
+        expect(acc.decode(0xc3)).toBe(undefined);
+        expect(acc.decode(0xa9)).toBe('é'.codePointAt(0));
+    });
+});
+
+describe('textToCodePoints', () => {
+    test('textToCodePoints', () => {
+        const text = sampleText();
+        expect(textToCodePoints(text)).toEqual([...text].map((c) => c.codePointAt(0)));
     });
 });
 
 
@@ -1,3 +1,4 @@
+/* eslint-disable unicorn/prefer-code-point */
 /** A utf8 value represented as big endian 32bit number */
 export type Utf8BE32 = number;
 
@@ -102,20 +103,6 @@ export function decodeUtf8N_LE(utf8: Utf8LE32): CodePoint {
     return 0xfffd;
 }
 
-export function writeUtf8NtoBuffer(utf8: Utf8BE32, buffer: Uint8Array, offset: number): number {
-    const b0 = (utf8 >> 24) & 0xff;
-    const b1 = (utf8 >> 16) & 0xff;
-    const b2 = (utf8 >> 8) & 0xff;
-    const b3 = utf8 & 0xff;
-
-    let i = 0;
-    b0 && (buffer[offset + i++] = b0);
-    b1 && (buffer[offset + i++] = b1);
-    b2 && (buffer[offset + i++] = b2);
-    buffer[offset + i++] = b3;
-    return i;
-}
-
 export class Utf8Accumulator {
     remaining = 0;
     value = 0;
@@ -210,6 +197,94 @@ function* _decodeUtf8ByteStream(bytes: Iterable<number>): Iterable<CodePoint> {
     }
 }
 
+/**
+ * Encode one code point as UTF-8 and store the bytes in `into`, starting at `offset`.
+ *
+ * @param code - the code point to encode.
+ * @param into - destination array or byte buffer.
+ * @param offset - index of the first byte to write (defaults to 0).
+ * @returns the number of bytes written (1 to 4).
+ */
+export function encodeUtf8into(code: CodePoint, into: Array<number> | Uint8Array, offset = 0): number {
+    // 1 byte: 0xxxxxxx
+    if (code < 0x80) {
+        into[offset] = code;
+        return 1;
+    }
+    const tail = 0x80 | (code & 0x3f);
+    // 2 bytes: 110xxxxx 10xxxxxx
+    if (code < 0x800) {
+        into[offset] = 0xc0 | ((code >> 6) & 0x1f);
+        into[offset + 1] = tail;
+        return 2;
+    }
+    const mid = 0x80 | ((code >> 6) & 0x3f);
+    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if (code < 0x1_0000) {
+        into[offset] = 0xe0 | ((code >> 12) & 0x0f);
+        into[offset + 1] = mid;
+        into[offset + 2] = tail;
+        return 3;
+    }
+    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    into[offset] = 0xf0 | ((code >> 18) & 0x07);
+    into[offset + 1] = 0x80 | ((code >> 12) & 0x3f);
+    into[offset + 2] = mid;
+    into[offset + 3] = tail;
+    return 4;
+}
+
+/**
+ * Encode a string as UTF-8 and store the bytes in `into`, starting at `offset`.
+ *
+ * Surrogate pairs are combined into one code point and written as 4 bytes. A lone
+ * surrogate (no matching partner) is written as its own 3-byte sequence and the code
+ * unit after it is still encoded. Previously that following code unit was skipped.
+ * Note that `TextEncoder` would emit U+FFFD for a lone surrogate instead.
+ *
+ * @param text - the text to encode.
+ * @param into - destination array or byte buffer.
+ * @param offset - index of the first byte to write (defaults to 0).
+ * @returns the number of bytes written.
+ */
+export function encodeTextToUtf8Into(text: string, into: Array<number> | Uint8Array, offset = 0): number {
+    let i = offset;
+    const len = text.length;
+    for (let j = 0; j < len; j++) {
+        let code = text.charCodeAt(j);
+        if ((code & 0xf800) === 0xd800) {
+            code = text.codePointAt(j) ?? code;
+            // Only a real pair consumes two code units; a lone surrogate consumes one.
+            if (code > 0xffff) j++;
+        }
+        if (code < 0x80) {
+            into[i++] = code;
+            continue;
+        }
+        if (code < 0x800) {
+            const u = 0xc080 | ((code & 0x7c0) << 2) | (code & 0x3f);
+            into[i++] = u >>> 8;
+            into[i++] = u & 0xff;
+            continue;
+        }
+        if (code < 0x1_0000) {
+            const u = 0xe0_8080 | ((code & 0xf000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f);
+            into[i++] = u >>> 16;
+            into[i++] = (u >>> 8) & 0xff;
+            into[i++] = u & 0xff;
+            continue;
+        }
+        const u =
+            0xf080_8080 |
+            (((code & 0x1c_0000) << 6) | ((code & 0x03_f000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f));
+        into[i++] = (u >>> 24) & 0x0ff;
+        into[i++] = (u >>> 16) & 0xff;
+        into[i++] = (u >>> 8) & 0xff;
+        into[i++] = u & 0xff;
+    }
+    return i - offset;
+}
+
+/**
+ * Encode a string as UTF-8 and return the bytes as a plain number array.
+ *
+ * @param text - the text to encode.
+ * @returns a new array holding one entry per encoded byte.
+ */
+export function encodeTextToUtf8(text: string): number[] {
+    // Start with one slot per UTF-16 code unit, then set the length to the bytes written.
+    const bytes = new Array(text.length);
+    const used = encodeTextToUtf8Into(text, bytes);
+    if (bytes.length !== used) {
+        bytes.length = used;
+    }
+    return bytes;
+}
+
+/**
+ * Convert a string into its sequence of code points.
+ *
+ * Surrogate pairs become a single code point. A lone surrogate is returned as its own
+ * value and the code unit after it is kept. Previously that following code unit was
+ * dropped. The result matches `[...text].map((c) => c.codePointAt(0))`.
+ *
+ * @param text - the text to convert.
+ * @returns the code points in order.
+ */
+export function textToCodePoints(text: string): CodePoint[] {
+    const len = text.length;
+    const codePoints: CodePoint[] = new Array(len);
+    let j = 0;
+    for (let i = 0; i < len; i++) {
+        let code = text.charCodeAt(i);
+        if ((code & 0xf800) === 0xd800) {
+            code = text.codePointAt(i) ?? code;
+            // Skip the low surrogate only when a full pair was actually read.
+            if (code > 0xffff) i++;
+        }
+        codePoints[j++] = code;
+    }
+    codePoints.length = j;
+    return codePoints;
+}
+
+/**
+ * Encode a list of code points as UTF-8 and store the bytes in `into`, starting at `offset`.
+ *
+ * @param data - the code points to encode, in order.
+ * @param into - destination array or byte buffer.
+ * @param offset - index of the first byte to write (defaults to 0).
+ * @returns the total number of bytes written.
+ */
+export function encodeCodePointsToUtf8Into(data: CodePoint[], into: Array<number> | Uint8Array, offset = 0): number {
+    let pos = offset;
+    for (let k = 0; k < data.length; ++k) {
+        // Each call reports how many bytes it wrote; advance the write position by that much.
+        pos += encodeUtf8into(data[k], into, pos);
+    }
+    return pos - offset;
+}
+
 export function hex32(n: number): string {
     if (n < 0) n = 0x1_0000_0000 + n;
     const s = '0x' + n.toString(16).padStart(8, '0');
 
@@ -1,5 +1,6 @@
 import { suite } from 'perf-insight';
 
+import { encodeTextToUtf8 } from '../lib/TrieBlob/Utf8.js';
 import { readFastTrieBlobFromConfig, readTrieFromConfig } from '../test/dictionaries.test.helper.js';
 
 // const measureTimeout = 100;
@@ -24,14 +25,14 @@ suite('encode to sequence', async (test) => {
 
     test('trieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => {
         for (const word of words) {
-            trieBlob.wordToNodeCharIndexSequence(word);
+            trieBlob.wordToUtf8Seq(word);
         }
     });
 
     test('trieBlob.wordToNodeCharIndexSequence x4' + msgSuffix, () => {
         for (const word of words) {
             for (let i = 0; i < 4; ++i) {
-                trieBlob.wordToNodeCharIndexSequence(word);
+                trieBlob.wordToUtf8Seq(word);
             }
         }
     });
@@ -42,9 +43,9 @@ suite('encode to sequence', async (test) => {
         }
     });
 
-    test('charIndex.__wordToCharIndexSequence' + msgSuffix, () => {
+    test('encodeTextToUtf8' + msgSuffix, () => {
         for (const word of words) {
-            charIndex.__wordToUtf8Seq(word);
+            encodeTextToUtf8(word);
         }
     });
 
 
@@ -36,8 +36,10 @@ const defaultConfig = {
                 '**/*.d.mts',
                 '**/*.d.ts',
                 '**/*.test.*',
+                '**/*.config.*',
                 '**/fixtures/**',
                 '**/perf/**',
+                '**/*.perf.*',
                 '**/samples/**',
                 '**/test*/**',
                 '**/test.*',
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`import { suite } from 'perf-insight';`
`2`	`2`
	`3`	`+import { encodeTextToUtf8 } from '../lib/TrieBlob/Utf8.js';`
`3`	`4`	`import { readFastTrieBlobFromConfig, readTrieFromConfig } from '../test/dictionaries.test.helper.js';`
`4`	`5`
`5`	`6`	`// const measureTimeout = 100;`
`@@ -24,14 +25,14 @@ suite('encode to sequence', async (test) => {`
`24`	`25`
`25`	`26`	`test('trieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => {`
`26`	`27`	`for (const word of words) {`
`27`		`- trieBlob.wordToNodeCharIndexSequence(word);`
	`28`	`+ trieBlob.wordToUtf8Seq(word);`
`28`	`29`	`}`
`29`	`30`	`});`
`30`	`31`
`31`	`32`	`test('trieBlob.wordToNodeCharIndexSequence x4' + msgSuffix, () => {`
`32`	`33`	`for (const word of words) {`
`33`	`34`	`for (let i = 0; i < 4; ++i) {`
`34`		`- trieBlob.wordToNodeCharIndexSequence(word);`
	`35`	`+ trieBlob.wordToUtf8Seq(word);`
`35`	`36`	`}`
`36`	`37`	`}`
`37`	`38`	`});`
`@@ -42,9 +43,9 @@ suite('encode to sequence', async (test) => {`
`42`	`43`	`}`
`43`	`44`	`});`
`44`	`45`
`45`		`- test('charIndex.__wordToCharIndexSequence' + msgSuffix, () => {`
	`46`	`+ test('encodeTextToUtf8' + msgSuffix, () => {`
`46`	`47`	`for (const word of words) {`
`47`		`- charIndex.__wordToUtf8Seq(word);`
	`48`	`+ encodeTextToUtf8(word);`
`48`	`49`	`}`
`49`	`50`	`});`
`50`	`51`