Skip to content

Commit 077b3ba

Browse files
authoredJul 16, 2024··
refactor: char index (#5926)
1 parent 9986720 commit 077b3ba

11 files changed

+431
-122
lines changed
 

‎packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts

+4-4
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
99
const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
1010
const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
1111
expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
12-
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
13-
expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
14-
expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
12+
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
13+
expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
14+
expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.
1515

1616
// Add the same letters again.
1717
expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));
1818

1919
const charIndex = charIndexBuilder.build();
20-
expect(charIndex.size).toBe(11);
20+
expect(charIndex.size).toBe(16);
2121
expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
2222
});
2323
});
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
1+
import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
22

33
export type Utf8Seq = Readonly<number[]>;
44

55
export type CharIndexMap = Record<string, Utf8BE32>;
66

77
export type RO_CharIndexMap = Readonly<CharIndexMap>;
88

9-
export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
9+
export type CharIndexSeqMap = Record<string, Utf8Seq>;
1010

1111
export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;
1212

@@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
1515
Object.freeze(emptySeq);
1616

1717
export class CharIndex {
18-
readonly charToUtf8Map: RO_CharIndexMap;
19-
readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
18+
#charToUtf8SeqMap: CharIndexSeqMap;
2019

2120
#lastWord = '';
2221
#lastWordSeq: Utf8Seq = [];
22+
#multiByteChars: boolean;
2323

2424
constructor(readonly charIndex: readonly string[]) {
25-
this.charToUtf8Map = buildCharIndexMap(charIndex);
26-
this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
27-
}
28-
29-
getUtf8Value(c: string): number {
30-
return this.charToUtf8Map[c] || 0;
25+
this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
26+
this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
3127
}
3228

3329
getCharUtf8Seq(c: string): Utf8Seq {
34-
const r = this.charToUtf8SeqMap[c] ?? emptySeq;
35-
return typeof r === 'number' ? [r] : r;
36-
}
37-
38-
__wordToUtf8Seq(word: string): Utf8Seq {
39-
// Note: Array.flatMap is very slow
40-
const seq: number[] = new Array(word.length);
41-
let i = 0;
42-
for (const c of word) {
43-
const cSep = this.charToUtf8SeqMap[c];
44-
if (typeof cSep === 'number') {
45-
seq[i++] = cSep;
46-
continue;
47-
}
48-
if (!cSep) {
49-
seq[i++] = 0;
50-
continue;
51-
}
52-
for (const cIdx of cSep) {
53-
seq[i++] = cIdx;
54-
}
55-
}
56-
if (seq.length !== i) seq.length = i;
57-
return seq;
30+
const found = this.#charToUtf8SeqMap[c];
31+
if (found) return found;
32+
const s = encodeTextToUtf8(c);
33+
this.#charToUtf8SeqMap[c] = s;
34+
return s;
5835
}
5936

6037
wordToUtf8Seq(word: string): Utf8Seq {
6138
if (this.#lastWord === word) return this.#lastWordSeq;
6239

63-
const seq = this.__wordToUtf8Seq(word);
40+
const seq = encodeTextToUtf8(word);
6441

6542
this.#lastWord = word;
6643
this.#lastWordSeq = seq;
@@ -69,7 +46,7 @@ export class CharIndex {
6946
}
7047

7148
indexContainsMultiByteChars(): boolean {
72-
return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
49+
return this.#multiByteChars;
7350
}
7451

7552
get size(): number {
@@ -81,22 +58,10 @@ export class CharIndex {
8158
}
8259
}
8360

84-
function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
85-
const map: CharIndexMap = Object.create(null);
86-
for (const c of charIndex) {
87-
const cn = c.normalize('NFC');
88-
const utf8 = encodeUtf8N_BE(cn.codePointAt(0) || 0);
89-
map[c] = utf8;
90-
map[c.normalize('NFC')] = utf8;
91-
map[c.normalize('NFD')] = utf8;
92-
}
93-
return map;
94-
}
95-
96-
function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
61+
function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
9762
const map: CharIndexSeqMap = Object.create(null);
98-
for (const [key, value] of Object.entries(charIndexMap)) {
99-
map[key] = splitUtf8IfNeeded(value);
63+
for (const key of charIndex) {
64+
map[key] = encodeTextToUtf8(key);
10065
}
10166
return map;
10267
}
@@ -106,7 +71,7 @@ export class CharIndexBuilder {
10671
readonly charIndexMap: CharIndexMap = Object.create(null);
10772
readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);
10873

109-
readonly #mapIdxToSeq = new Map<number, number[] | number>();
74+
readonly #mapIdxToSeq = new Map<number, number[]>();
11075

11176
constructor() {
11277
this.getUtf8Value('');
@@ -126,24 +91,22 @@ export class CharIndexBuilder {
12691
return utf8;
12792
}
12893

129-
utf8ValueToUtf8Seq(idx: number): number[] | number {
94+
utf8ValueToUtf8Seq(idx: number): number[] {
13095
const found = this.#mapIdxToSeq.get(idx);
13196
if (found !== undefined) {
13297
return found;
13398
}
134-
const seq = splitUtf8IfNeeded(idx);
99+
const seq = splitUtf8(idx);
135100
this.#mapIdxToSeq.set(idx, seq);
136101
return seq;
137102
}
138103

139104
charToUtf8Seq(c: string): number[] {
140105
const idx = this.getUtf8Value(c);
141-
const s = this.utf8ValueToUtf8Seq(idx);
142-
return typeof s === 'number' ? [s] : s;
106+
return this.utf8ValueToUtf8Seq(idx);
143107
}
144108

145109
wordToUtf8Seq(word: string): number[] {
146-
// word = word.normalize('NFC');
147110
const seq: number[] = new Array(word.length);
148111
let i = 0;
149112
for (const c of word) {
@@ -170,8 +133,9 @@ export class CharIndexBuilder {
170133
}
171134
}
172135

173-
function splitUtf8IfNeeded(utf8: number): number | number[] {
174-
if (utf8 < 0x80) return utf8;
175-
const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
176-
return s.length ? s : s[0];
136+
/**
 * Split a UTF-8 sequence packed big-endian into a single number into its individual bytes.
 *
 * The number of bytes returned is decided by the magnitude of the packed value:
 * up to `0xff` gives one byte, up to `0xffff` two, up to `0xff_ffff` three, otherwise four.
 *
 * @param utf8 - the packed UTF-8 value, most significant byte first.
 * @returns the bytes of the sequence in order.
 */
function splitUtf8(utf8: number): number[] {
    if (utf8 <= 0xff) return [utf8];

    const byte3 = utf8 & 0xff;
    const byte2 = (utf8 >> 8) & 0xff;
    if (utf8 <= 0xffff) return [byte2, byte3];

    const byte1 = (utf8 >> 16) & 0xff;
    if (utf8 <= 0xff_ffff) return [byte1, byte2, byte3];

    const byte0 = (utf8 >> 24) & 0xff;
    // Zero bytes are dropped from the four byte form.
    return [byte0, byte1, byte2, byte3].filter((byte) => byte !== 0);
}

‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {
214214

215215
static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
216216
return new FastTrieBlobIRoot(
217-
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
217+
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
218218
0,
219219
trie.info,
220220
);

‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts

-33
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
111111
for (let i = 0; i < utf8Seq.length; ++i) {
112112
insertCharIndexes(utf8Seq[i], pDepth);
113113
}
114-
// dumpState({ step: 'insertChar', char });
115114
};
116115

117116
/**
@@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
174173
const pos = s.pos;
175174
const node = nodes[nodeIdx];
176175
node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);
177-
178-
// dumpState({ step: 'reference', refId, refNodeIdx });
179176
};
180177

181178
const backStep = (num: number) => {
@@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
186183
depth = stack[depth].pDepth;
187184
}
188185
nodeIdx = stack[depth + 1].nodeIdx;
189-
190-
// dumpState({ step: 'backStep', num });
191186
};
192187

193-
// function dumpNode(node: number[]): string {
194-
// const n = node
195-
// .map((n, i) => {
196-
// if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
197-
// return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
198-
// })
199-
// .join(', ');
200-
// return `[${n}]`;
201-
// }
202-
203-
// function dumpNodes(nodes: FastTrieBlobNode[]) {
204-
// return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
205-
// }
206-
207-
// const debug = false;
208-
209-
// function dumpState(extra?: Record<string, unknown>) {
210-
// debug &&
211-
// console.warn('%o', {
212-
// stack: stack.slice(0, depth + 1),
213-
// nodes: dumpNodes(nodes),
214-
// nodeIdx,
215-
// depth,
216-
// refNodes,
217-
// ...extra,
218-
// });
219-
// }
220-
221188
const c: BuilderCursor = {
222189
insertChar,
223190
markEOW,

‎packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts

+5-2
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
1212
readonly nodes: number[][],
1313
readonly charIndex: CharIndex,
1414
maskInfo: FastTrieBlobBitMaskInfo,
15-
sorted = false,
1615
) {
1716
const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
1817
this.NodeMaskEOW = NodeMaskEOW;
1918
this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
2019
this.NodeChildRefShift = NodeChildRefShift;
2120
this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
22-
!sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
21+
sortNodes(nodes, this.NodeMaskChildCharIndex);
2322
}
2423
}
2524

@@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
3029
* @returns
3130
*/
3231
export function sortNodes(nodes: number[][], mask: number): number[][] {
32+
if (Object.isFrozen(nodes)) {
33+
assertSorted(nodes, mask);
34+
return nodes;
35+
}
3336
for (let i = 0; i < nodes.length; ++i) {
3437
let node = nodes[i];
3538
if (node.length > 2) {

‎packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
8282
this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
8383
}
8484

85-
public wordToNodeCharIndexSequence(word: string): Utf8Seq {
85+
public wordToUtf8Seq(word: string): Utf8Seq {
8686
return this.charIndex.wordToUtf8Seq(word);
8787
}
8888

@@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
159159
const NodeChildRefShift = TrieBlob.NodeChildRefShift;
160160
const nodes = this.nodes;
161161
const nodes8 = this.#nodes8;
162-
const wordIndexes = this.wordToNodeCharIndexSequence(word);
162+
const wordIndexes = this.wordToUtf8Seq(word);
163163
const lookup = this.#nodeIdxLookup;
164164
const len = wordIndexes.length;
165165
let p = 0;

‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts

+164-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
11
import { suite } from 'perf-insight';
22

3-
import { decodeUtf8ByteStream, decodeUtf8N_BE, decodeUtf8N_LE, encodeUtf8N_BE, encodeUtf8N_LE } from './Utf8.js';
3+
import {
4+
decodeUtf8ByteStream,
5+
decodeUtf8N_BE,
6+
decodeUtf8N_LE,
7+
encodeCodePointsToUtf8Into,
8+
encodeTextToUtf8,
9+
encodeTextToUtf8Into,
10+
encodeUtf8N_BE,
11+
encodeUtf8N_LE,
12+
textToCodePoints,
13+
} from './Utf8.js';
414

515
suite('Utf8 encode/decode', async (test) => {
616
const iterations = 1000;
717
const text = sampleText();
18+
const words = text.split(/\s+/).filter((a) => !!a);
19+
const wordsCP = words.map((word) => [...word].map((char) => char.codePointAt(0) || 0));
820
const chars = [...text];
21+
const codePoints = chars.map((char) => char.codePointAt(0) || 0);
922
const encoder = new TextEncoder();
1023
const decoder = new TextDecoder();
1124
const scratchBuffer = new Uint8Array(1024);
@@ -29,6 +42,16 @@ suite('Utf8 encode/decode', async (test) => {
2942
}
3043
});
3144

45+
test('TextEncoder.encodeInto by char', () => {
46+
const buffer = new Uint8Array(scratchBuffer.buffer, 0, 4);
47+
for (let i = iterations; i > 0; --i) {
48+
for (const char of chars) {
49+
buffer[0] = 0;
50+
encoder.encodeInto(char, buffer);
51+
}
52+
}
53+
});
54+
3255
test('encodeUtf8N_BE', () => {
3356
for (let i = iterations; i > 0; --i) {
3457
for (const char of chars) {
@@ -50,6 +73,146 @@ suite('Utf8 encode/decode', async (test) => {
5073
}
5174
}
5275
});
76+
77+
test('TextEncoder.encodeInto text', () => {
78+
const buffer = scratchBuffer;
79+
const _text = text;
80+
for (let i = iterations; i > 0; --i) {
81+
encoder.encodeInto(_text, buffer);
82+
}
83+
});
84+
85+
test('Buffer.write text', () => {
86+
const buffer = Buffer.from(scratchBuffer.buffer);
87+
// const _text = text;
88+
for (let i = iterations; i > 0; --i) {
89+
buffer.write(text, 'utf16le');
90+
}
91+
});
92+
93+
test('encodeCodePointsInto', () => {
94+
const buffer = scratchBuffer;
95+
const points = codePoints;
96+
for (let i = iterations; i > 0; --i) {
97+
encodeCodePointsToUtf8Into(points, buffer);
98+
}
99+
});
100+
101+
test(`TextEncoder.encodeInto words (${words.length})`, () => {
102+
const buffer = scratchBuffer;
103+
const _words = words;
104+
for (let i = iterations; i > 0; --i) {
105+
for (const word of _words) {
106+
encoder.encodeInto(word, buffer);
107+
}
108+
}
109+
});
110+
111+
test(`encodeCodePointsInto wordsCP (${words.length})`, () => {
112+
const buffer = scratchBuffer;
113+
const words = wordsCP;
114+
for (let i = iterations; i > 0; --i) {
115+
for (const points of words) {
116+
encodeCodePointsToUtf8Into(points, buffer);
117+
}
118+
}
119+
});
120+
121+
test(`encodeCodePointsInto Array wordsCP (${words.length})`, () => {
122+
const buffer = new Array(100);
123+
const words = wordsCP;
124+
for (let i = iterations; i > 0; --i) {
125+
for (const points of words) {
126+
encodeCodePointsToUtf8Into(points, buffer);
127+
}
128+
}
129+
});
130+
131+
test(`encodeCodePointsInto wordsCP .codePointAt (${words.length})`, () => {
132+
const buffer = scratchBuffer;
133+
const _words = words;
134+
for (let i = iterations; i > 0; --i) {
135+
for (const word of _words) {
136+
encodeCodePointsToUtf8Into(
137+
[...word].map((a) => a.codePointAt(0) || 0),
138+
buffer,
139+
);
140+
}
141+
}
142+
});
143+
144+
test(`encodeTextToUtf8Into Uint8Array words (${words.length})`, () => {
145+
const buffer = scratchBuffer;
146+
const _words = words;
147+
for (let i = iterations; i > 0; --i) {
148+
for (const word of _words) {
149+
encodeTextToUtf8Into(word, buffer);
150+
}
151+
}
152+
});
153+
154+
test(`encodeTextToUtf8Into array words (${words.length})`, () => {
155+
const buffer = new Array(100);
156+
const _words = words;
157+
for (let i = iterations; i > 0; --i) {
158+
for (const word of _words) {
159+
encodeTextToUtf8Into(word, buffer);
160+
}
161+
}
162+
});
163+
164+
test(`encoder.encode(word) to array words (${words.length})`, () => {
165+
const _words = words;
166+
for (let i = iterations; i > 0; --i) {
167+
for (const word of _words) {
168+
[...encoder.encode(word)];
169+
}
170+
}
171+
});
172+
173+
test(`encodeTextToUtf8 array words (${words.length})`, () => {
174+
const _words = words;
175+
for (let i = iterations; i > 0; --i) {
176+
for (const word of _words) {
177+
encodeTextToUtf8(word);
178+
}
179+
}
180+
});
181+
182+
const charToUtf8Map = new Map<string, number[]>(
183+
[...new Set([...sampleText()])].map((char) => [char, encodeTextToUtf8(char)] as const),
184+
);
185+
186+
test(`encodeTextToUtf8 to array with lookup (${words.length})`, () => {
187+
const _words = words;
188+
for (let i = iterations; i > 0; --i) {
189+
for (const word of _words) {
190+
const a: number[] = new Array(word.length * 2);
191+
let i = 0;
192+
for (const c of word) {
193+
const u8 = charToUtf8Map.get(c);
194+
for (const u of u8 || []) {
195+
a[i++] = u;
196+
}
197+
}
198+
a.length = i;
199+
}
200+
}
201+
});
202+
203+
test('textToCodePoints', () => {
204+
const _text = text;
205+
for (let i = iterations; i > 0; --i) {
206+
textToCodePoints(_text);
207+
}
208+
});
209+
210+
test('textToCodePoints map', () => {
211+
const _text = text;
212+
for (let i = iterations; i > 0; --i) {
213+
[..._text].map((a) => a.codePointAt(0) || 0);
214+
}
215+
});
53216
});
54217

55218
suite('Utf8 decode buffer', async (test) => {

‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts

+134
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@ import {
66
decodeUtf8ByteStream,
77
decodeUtf8N_BE,
88
decodeUtf8N_LE,
9+
encodeCodePointsToUtf8Into,
10+
encodeTextToUtf8,
911
encodeUtf8N_BE,
1012
encodeUtf8N_LE,
1113
hex32,
14+
textToCodePoints,
1215
Utf8Accumulator,
1316
} from './Utf8.js';
1417

@@ -43,6 +46,77 @@ describe('Utf8 lib', () => {
4346
}
4447
});
4548

49+
test.each`
50+
text | expected
51+
${'a'} | ${[0x61]}
52+
${'ab'} | ${[0x61, 0x62]}
53+
${'é'} | ${[195, 169]}
54+
${'🇺🇸'} | ${[240, 159, 135, 186, 240, 159, 135, 184]}
55+
`('encodeTextToUtf8 $text', ({ text, expected }) => {
56+
expect(encodeTextToUtf8(text)).toEqual(expected);
57+
expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]);
58+
59+
const scratch: number[] = [];
60+
const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch);
61+
expect(scratch.slice(0, len)).toEqual(expected);
62+
});
63+
64+
test('encodeCodePointsToUtf8Into', () => {
65+
const decoder = new TextDecoder();
66+
const text = sampleText();
67+
const scratch: number[] = [];
68+
const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch);
69+
const buf = new Uint8Array(scratch.slice(0, len));
70+
expect(decoder.decode(buf)).toBe(text);
71+
});
72+
73+
test.each`
74+
text | expected
75+
${'a'} | ${[0x61]}
76+
${'ab'} | ${[0x61, 0x62]}
77+
${'é'} | ${[0xc3a9]}
78+
${'🇺🇸'} | ${[0xf09f_87ba, 0xf09f_87b8]}
79+
`('encodeUtf8N_BE $text', ({ text, expected }) => {
80+
const utf = textToCodePoints(text).map((cp) => encodeUtf8N_BE(cp));
81+
expect(utf).toEqual(expected);
82+
expect(
83+
String.fromCodePoint(
84+
...utf
85+
.map((v) => v ^ ~1) // force it to be native
86+
.map((v) => v ^ ~1)
87+
.map((c) => decodeUtf8N_BE(c)),
88+
),
89+
).toEqual(text);
90+
});
91+
92+
test('decodeUtf8N_BE invalid', () => {
93+
expect(decodeUtf8N_BE(0xff)).toBe(0xfffd);
94+
});
95+
96+
test('decodeUtf8N_LE invalid', () => {
97+
expect(decodeUtf8N_LE(0xff)).toBe(0xfffd);
98+
});
99+
100+
test.each`
101+
text | expected
102+
${'a'} | ${[0x61]}
103+
${'ab'} | ${[0x61, 0x62]}
104+
${'é'} | ${[0xa9c3]}
105+
${'ë'} | ${[0xabc3]}
106+
${'🇺🇸'} | ${[0xba87_9ff0, 0xb887_9ff0]}
107+
`('encodeUtf8N_LE $text', ({ text, expected }) => {
108+
const utf = textToCodePoints(text).map((cp) => encodeUtf8N_LE(cp));
109+
expect(utf).toEqual(expected);
110+
expect(
111+
String.fromCodePoint(
112+
...utf
113+
.map((v) => v ^ ~1) // force it to be native
114+
.map((v) => v ^ ~1)
115+
.map((c) => decodeUtf8N_LE(c)),
116+
),
117+
).toEqual(text);
118+
});
119+
46120
test.each`
47121
value | expected
48122
${0xff} | ${'0x0000_00ff'}
@@ -78,6 +152,66 @@ describe('Utf8Accumulator', () => {
78152
const data = encoder.encode(text);
79153

80154
expect([...decodeUtf8ByteStream(data)]).toEqual([...text].map((c) => c.codePointAt(0)));
155+
156+
function* gen() {
157+
yield* data;
158+
}
159+
expect([...decodeUtf8ByteStream(gen())]).toEqual([...text].map((c) => c.codePointAt(0)));
160+
});
161+
162+
test('encodeTextToUtf8', () => {
163+
const text = sampleText();
164+
expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]);
165+
});
166+
167+
test('decodeUtf8ByteStream', () => {
168+
const text = sampleText();
169+
expect(String.fromCodePoint(...decodeUtf8ByteStream(encoder.encode(text)))).toBe(text);
170+
});
171+
172+
test('Utf8Accumulator isMultiByte', () => {
173+
expect(Utf8Accumulator.isMultiByte(0x7f)).toBe(false);
174+
expect(Utf8Accumulator.isMultiByte(0xf0)).toBe(true);
175+
expect(Utf8Accumulator.isSingleByte(0x7f)).toBe(true);
176+
expect(Utf8Accumulator.isSingleByte(0xf0)).toBe(false);
177+
});
178+
179+
test('Utf8Accumulator', () => {
180+
const acc = Utf8Accumulator.create();
181+
182+
expect(acc.decode(0x61)).toBe(0x61);
183+
expect(acc.decode(0x61)).toBe(0x61);
184+
185+
// é
186+
expect(acc.decode(0xc3)).toBe(undefined);
187+
const cloneAcc = acc.clone();
188+
expect(acc.decode(0xa9)).toBe('é'.codePointAt(0));
189+
expect(acc.decode(0x61)).toBe(0x61);
190+
// ë
191+
expect(cloneAcc.decode(0xab)).toBe('ë'.codePointAt(0));
192+
193+
// out of order
194+
expect(acc.decode(0xa9)).toBe(0xfffd);
195+
expect(acc.decode(0xc3)).toBe(undefined);
196+
acc.reset();
197+
198+
// two leads in a row
199+
expect(acc.decode(0xc3)).toBe(undefined);
200+
expect(acc.decode(0xc3)).toBe(0xfffd);
201+
expect(acc.decode(0xa9)).toBe(0xfffd);
202+
203+
// two leads in a row
204+
expect(acc.decode(0xc3)).toBe(undefined);
205+
acc.reset();
206+
expect(acc.decode(0xc3)).toBe(undefined);
207+
expect(acc.decode(0xa9)).toBe('é'.codePointAt(0));
208+
});
209+
});
210+
211+
describe('textToCodePoints', () => {
212+
test('textToCodePoints', () => {
213+
const text = sampleText();
214+
expect(textToCodePoints(text)).toEqual([...text].map((c) => c.codePointAt(0)));
81215
});
82216
});
83217

‎packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts

+89-14
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* eslint-disable unicorn/prefer-code-point */
12
/** A utf8 value represented as big endian 32bit number */
23
export type Utf8BE32 = number;
34

@@ -102,20 +103,6 @@ export function decodeUtf8N_LE(utf8: Utf8LE32): CodePoint {
102103
return 0xfffd;
103104
}
104105

105-
export function writeUtf8NtoBuffer(utf8: Utf8BE32, buffer: Uint8Array, offset: number): number {
106-
const b0 = (utf8 >> 24) & 0xff;
107-
const b1 = (utf8 >> 16) & 0xff;
108-
const b2 = (utf8 >> 8) & 0xff;
109-
const b3 = utf8 & 0xff;
110-
111-
let i = 0;
112-
b0 && (buffer[offset + i++] = b0);
113-
b1 && (buffer[offset + i++] = b1);
114-
b2 && (buffer[offset + i++] = b2);
115-
buffer[offset + i++] = b3;
116-
return i;
117-
}
118-
119106
export class Utf8Accumulator {
120107
remaining = 0;
121108
value = 0;
@@ -210,6 +197,94 @@ function* _decodeUtf8ByteStream(bytes: Iterable<number>): Iterable<CodePoint> {
210197
}
211198
}
212199

200+
/**
 * Encode a single code point as UTF-8 and store the bytes in `into`, starting at `offset`.
 *
 * The length of the sequence is chosen from the value of `code`:
 * below `0x80` one byte, below `0x800` two, below `0x1_0000` three, otherwise four.
 * No bounds checking is done on `into`.
 *
 * @param code - the code point to encode.
 * @param into - the array or byte buffer that receives the bytes.
 * @param offset - index in `into` of the first byte written.
 * @returns the number of bytes written (1 to 4).
 */
export function encodeUtf8into(code: CodePoint, into: Array<number> | Uint8Array, offset = 0): number {
    const tail = 0x80 | (code & 0x3f);

    if (code < 0x80) {
        into[offset] = code;
        return 1;
    } else if (code < 0x800) {
        // 110xxxxx 10xxxxxx
        into[offset] = 0xc0 | ((code & 0x7c0) >> 6);
        into[offset + 1] = tail;
        return 2;
    } else if (code < 0x1_0000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        into[offset] = 0xe0 | ((code & 0xf000) >> 12);
        into[offset + 1] = 0x80 | ((code & 0x0fc0) >> 6);
        into[offset + 2] = tail;
        return 3;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    into[offset] = 0xf0 | ((code & 0x1c_0000) >> 18);
    into[offset + 1] = 0x80 | ((code & 0x03_f000) >> 12);
    into[offset + 2] = 0x80 | ((code & 0x0fc0) >> 6);
    into[offset + 3] = tail;
    return 4;
}
226+
227+
export function encodeTextToUtf8Into(text: string, into: Array<number> | Uint8Array, offset = 0): number {
228+
let i = offset;
229+
const len = text.length;
230+
for (let j = 0; j < len; j++) {
231+
let code = text.charCodeAt(j);
232+
code = (code & 0xf800) === 0xd800 ? text.codePointAt(j++) || 0 : code;
233+
if (code < 0x80) {
234+
into[i++] = code;
235+
continue;
236+
}
237+
if (code < 0x800) {
238+
const u = 0xc080 | ((code & 0x7c0) << 2) | (code & 0x3f);
239+
into[i++] = u >>> 8;
240+
into[i++] = u & 0xff;
241+
continue;
242+
}
243+
if (code < 0x1_0000) {
244+
const u = 0xe0_8080 | ((code & 0xf000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f);
245+
into[i++] = u >>> 16;
246+
into[i++] = (u >>> 8) & 0xff;
247+
into[i++] = u & 0xff;
248+
continue;
249+
}
250+
const u =
251+
0xf080_8080 |
252+
(((code & 0x1c_0000) << 6) | ((code & 0x03_f000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f));
253+
into[i++] = (u >>> 24) & 0x0ff;
254+
into[i++] = (u >>> 16) & 0xff;
255+
into[i++] = (u >>> 8) & 0xff;
256+
into[i++] = u & 0xff;
257+
}
258+
return i - offset;
259+
}
260+
261+
export function encodeTextToUtf8(text: string): number[] {
262+
const array = new Array(text.length);
263+
const len = encodeTextToUtf8Into(text, array);
264+
array.length !== len && (array.length = len);
265+
return array;
266+
}
267+
268+
export function textToCodePoints(text: string): CodePoint[] {
269+
const codePoints: CodePoint[] = new Array(text.length);
270+
const len = text.length;
271+
let j = 0;
272+
for (let i = 0; i < len; i++) {
273+
const code = text.charCodeAt(i);
274+
codePoints[j++] = (code & 0xf800) === 0xd800 ? text.codePointAt(i++) || 0 : code;
275+
}
276+
codePoints.length = j;
277+
return codePoints;
278+
}
279+
280+
export function encodeCodePointsToUtf8Into(data: CodePoint[], into: Array<number> | Uint8Array, offset = 0): number {
281+
let i = offset;
282+
for (const code of data) {
283+
i += encodeUtf8into(code, into, i);
284+
}
285+
return i - offset;
286+
}
287+
213288
export function hex32(n: number): string {
214289
if (n < 0) n = 0x1_0000_0000 + n;
215290
const s = '0x' + n.toString(16).padStart(8, '0');

‎packages/cspell-trie-lib/src/perf/charIndex.perf.ts

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { suite } from 'perf-insight';
22

3+
import { encodeTextToUtf8 } from '../lib/TrieBlob/Utf8.js';
34
import { readFastTrieBlobFromConfig, readTrieFromConfig } from '../test/dictionaries.test.helper.js';
45

56
// const measureTimeout = 100;
@@ -24,14 +25,14 @@ suite('encode to sequence', async (test) => {
2425

2526
test('trieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => {
2627
for (const word of words) {
27-
trieBlob.wordToNodeCharIndexSequence(word);
28+
trieBlob.wordToUtf8Seq(word);
2829
}
2930
});
3031

3132
test('trieBlob.wordToNodeCharIndexSequence x4' + msgSuffix, () => {
3233
for (const word of words) {
3334
for (let i = 0; i < 4; ++i) {
34-
trieBlob.wordToNodeCharIndexSequence(word);
35+
trieBlob.wordToUtf8Seq(word);
3536
}
3637
}
3738
});
@@ -42,9 +43,9 @@ suite('encode to sequence', async (test) => {
4243
}
4344
});
4445

45-
test('charIndex.__wordToCharIndexSequence' + msgSuffix, () => {
46+
test('encodeTextToUtf8' + msgSuffix, () => {
4647
for (const word of words) {
47-
charIndex.__wordToUtf8Seq(word);
48+
encodeTextToUtf8(word);
4849
}
4950
});
5051

‎vitest.config.mjs

+2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ const defaultConfig = {
3636
'**/*.d.mts',
3737
'**/*.d.ts',
3838
'**/*.test.*',
39+
'**/*.config.*',
3940
'**/fixtures/**',
4041
'**/perf/**',
42+
'**/*.perf.*',
4143
'**/samples/**',
4244
'**/test*/**',
4345
'**/test.*',

0 commit comments

Comments
 (0)
Please sign in to comment.