Skip to content

Commit

Permalink
fix: Implement Weighted Distance Algorithm (#2255)
Browse files Browse the repository at this point in the history
* doc: add comments to suggest
* dev: First pass at Levenshtein
* dev: Add an A* distance calculation function.
* dev: Implement weighted distance
  • Loading branch information
Jason3S committed Jan 15, 2022
1 parent cf2745c commit 3a4f9db
Show file tree
Hide file tree
Showing 12 changed files with 641 additions and 17 deletions.
5 changes: 3 additions & 2 deletions cspell-dict.txt
Expand Up @@ -2,14 +2,15 @@ alexiosc
backreference
bitjson
cheatsheets
Codecov
codecov
Codecov
codeql
COMPOUNDFLAG
coverallsapp
cspellcache
DAWG
Damerau
Dawg
DAWG
deserializers
exonum
gimu
Expand Down
18 changes: 18 additions & 0 deletions packages/cspell-trie-lib/src/lib/suggestions/distanceAStar.test.ts
@@ -0,0 +1,18 @@
import { distanceAStar } from './distanceAStar';
import { levenshteinDistance } from './levenshtein';

// Cross-checks `distanceAStar` against `levenshteinDistance` for a table of word pairs.
describe('distanceAStar', () => {
// Each table row is one pair of words; the header row names the values passed to the test callback.
test.each`
wordA | wordB
${''} | ${''}
${'apple'} | ${'apple'}
${'apple'} | ${''}
${'apple'} | ${'apples'}
${'apple'} | ${'maple'}
${'grapple'} | ${'maples'}
`('distanceAStar vs Levenshtein "$wordA" "$wordB"', ({ wordA, wordB }) => {
// The A* result is expected to be exactly 100 times the Levenshtein distance.
const expected = levenshteinDistance(wordA, wordB) * 100;
// The distance must be the same in both directions.
expect(distanceAStar(wordA, wordB)).toBe(expected);
expect(distanceAStar(wordB, wordA)).toBe(expected);
});
});
75 changes: 75 additions & 0 deletions packages/cspell-trie-lib/src/lib/suggestions/distanceAStar.ts
@@ -0,0 +1,75 @@
import { PairingHeap } from '../utils/PairingHeap';

/**
 * Calculate the edit distance between two words using an A* style best-first search.
 *
 * A candidate is a position `(ai, bi)` in the two words together with the cost paid to reach it.
 * Candidates are kept in a heap ordered by `compare` (lowest cost first), so the first candidate
 * dequeued at the end of both words carries the result. Matching characters are free; each
 * insert, delete, substitute, or swap of two adjacent characters costs 100.
 *
 * Using basic weights, this algorithm has the same results as the Damerau-Levenshtein algorithm.
 *
 * @param a - the word to transform
 * @param b - the word `a` is transformed into
 * @returns the total cost of the cheapest edit sequence found, or `-1` if the candidates run out
 *   before the end of both words is reached.
 */
export function distanceAStar(a: string, b: string): number {
    const aN = a.length;
    const bN = b.length;
    const cost = 100;

    const candidates = new PairingHeap(compare);

    // Positions that have already been expanded, encoded as `ai * stride + bi`.
    // Offsets never exceed the word lengths, so the encoding is unique.
    const stride = bN + 1;
    const visited = new Set<number>();

    candidates.add({ ai: 0, bi: 0, c: 0 });

    /** Substitute: consume one character from each word; free when the characters match. */
    function opSub(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN && bi < bN) {
            const cc = a[ai] === b[bi] ? c : c + cost;
            candidates.add({ ai: ai + 1, bi: bi + 1, c: cc });
        }
    }

    /** Insert: consume one character of `b` only. */
    function opIns(n: Node) {
        const { ai, bi, c } = n;
        if (bi < bN) {
            candidates.add({ ai: ai, bi: bi + 1, c: c + cost });
        }
    }

    /** Delete: consume one character of `a` only. */
    function opDel(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN) {
            candidates.add({ ai: ai + 1, bi: bi, c: c + cost });
        }
    }

    /** Swap: consume two characters from each word when they are the same pair in reverse order. */
    function opSwap(n: Node) {
        const { ai, bi, c } = n;
        if (a[ai] === b[bi + 1] && a[ai + 1] === b[bi]) {
            candidates.add({ ai: ai + 2, bi: bi + 2, c: c + cost });
        }
    }

    let best: Node | undefined;
    while ((best = candidates.dequeue())) {
        if (best.ai === aN && best.bi === bN) break;

        // Candidates come out in non-decreasing cost order and no edit has a negative cost, so the
        // first time a position is dequeued it already has its lowest cost. Expanding a later
        // duplicate can only repeat work; without this check the number of expansions grows
        // exponentially with the number of edits.
        const key = best.ai * stride + best.bi;
        if (visited.has(key)) continue;
        visited.add(key);

        opSwap(best);
        opIns(best);
        opDel(best);
        opSub(best);
    }

    return best?.c ?? -1;
}

/** A location in the pair of words being compared. */
interface Pos {
    /** index of the next unread character of string `a` */
    ai: number;
    /** index of the next unread character of string `b` */
    bi: number;
}

/** A search candidate: a position plus what it cost to get there. */
interface Node extends Pos {
    /** the cost accumulated so far */
    c: number;
}

/**
 * Ordering used for the candidate heap.
 *
 * The cheaper node sorts first; when the costs are equal, the node that has
 * consumed more characters (larger `ai + bi`) sorts first.
 */
function compare(a: Node, b: Node): number {
    const costGap = a.c - b.c;
    const progressGap = b.ai + b.bi - a.ai - a.bi;
    return costGap ? costGap : progressGap;
}
@@ -0,0 +1,42 @@
import { distanceAStarWeighted } from './distanceAStarWeighted';
import { levenshteinDistance } from './levenshtein';
import { buildWeightedMapTrie } from './weightedMaps';

// Tests for `distanceAStarWeighted`. The suite and test titles name the function under test so
// that reports are not confused with the separate `distanceAStar` suite.
describe('distanceAStarWeighted', () => {
    // With an empty weight map the result is expected to be 100 times the Levenshtein distance.
    test.each`
        wordA | wordB
        ${''} | ${''}
        ${'apple'} | ${'apple'}
        ${'apple'} | ${''}
        ${'apple'} | ${'apples'}
        ${'apple'} | ${'maple'}
        ${'grapple'} | ${'maples'}
    `('distanceAStarWeighted vs Levenshtein "$wordA" "$wordB"', ({ wordA, wordB }) => {
        const expected = levenshteinDistance(wordA, wordB) * 100;
        expect(distanceAStarWeighted(wordA, wordB, {})).toBe(expected);
        expect(distanceAStarWeighted(wordB, wordA, {})).toBe(expected);
    });

    // Each row supplies a weight-map definition and the exact distance expected with it.
    // cspell:ignore aeiou
    test.each`
        wordA | wordB | map | expected
        ${''} | ${''} | ${undefined} | ${0}
        ${'apple'} | ${'apple'} | ${{ map: 'ae', insDel: 75 }} | ${0}
        ${'apple'} | ${''} | ${{ map: 'ae', insDel: 75 }} | ${450}
        ${'apple'} | ${''} | ${{ map: 'ae|(ap)', insDel: 75 }} | ${350}
        ${'apple'} | ${''} | ${{ map: '(ap)', insDel: 1 }} | ${301}
        ${'apple'} | ${'apples'} | ${{ map: '(les)(le)', replace: 50 }} | ${50}
        ${'apple'} | ${'maple'} | ${{ map: '(pp)p', replace: 50 }} | ${150}
        ${'grapple'} | ${'maples'} | ${{ map: '(pp)p', replace: 50 }} | ${350}
        ${'bite'} | ${'bate'} | ${{ map: 'aei', replace: 25 }} | ${25}
        ${'receive'} | ${'recieve' /* cspell:ignore recieve */} | ${{ map: 'ei', swap: 25 }} | ${25}
        ${'airplane'} | ${'aeroplane'} | ${{ map: '(ai)(ae)', replace: 25 }} | ${125}
        ${'airplane'} | ${'aeroplane'} | ${{ map: '(air)(aero)|aeiou', replace: 25 }} | ${25}
        ${'airplane'} | ${'aeroplane'} | ${{ map: 'aeiou', replace: 25 }} | ${125}
        ${'plain'} | ${'plane'} | ${{ map: '(ane)(ain)', replace: 100 }} | ${100}
    `('distanceAStarWeighted "$wordA" "$wordB" $map', ({ wordA, wordB, map, expected }) => {
        const trie = map ? buildWeightedMapTrie([map]) : buildWeightedMapTrie([]);
        // The weighted distance must be the same in both directions.
        expect(distanceAStarWeighted(wordA, wordB, trie)).toBe(expected);
        expect(distanceAStarWeighted(wordB, wordA, trie)).toBe(expected);
    });
});
168 changes: 168 additions & 0 deletions packages/cspell-trie-lib/src/lib/suggestions/distanceAStarWeighted.ts
@@ -0,0 +1,168 @@
import { PairingHeap } from '../utils/PairingHeap';
import { WeightedMapTrie, WeightedRepMapTrie } from './weightedMaps';

/**
 * Calculate the weighted edit distance between two words using an A* style best-first search.
 *
 * A candidate is a position `(ai, bi)` in the two words together with the cost paid to reach it.
 * Candidates are kept in a heap ordered by `compare` (lowest cost first), so the first candidate
 * dequeued at the end of both words carries the result.
 *
 * Two families of edits are tried from every position:
 * - the basic edits (insert, delete, substitute, swap of two adjacent characters) at a fixed
 *   cost of 100 each, with matching characters being free;
 * - edits looked up in `map`, which may cover sequences of several characters and carry their
 *   own cost.
 *
 * Using basic weights (an empty `map`), this algorithm has the same results as the
 * Damerau-Levenshtein algorithm.
 *
 * @param a - the word to transform
 * @param b - the word `a` is transformed into
 * @param map - trie of weighted edits, keyed by character. As read here, a node may carry an
 *   `insDel` cost, a `t` link that continues the current character sequence, and an `r` link to
 *   the sequences it can be replaced by or swapped with (whose nodes carry `rep` / `swap` costs).
 *   Costs are assumed to be non-negative.
 * @returns the total cost of the cheapest edit sequence found, or `-1` if the candidates run out
 *   before the end of both words is reached.
 */
export function distanceAStarWeighted(a: string, b: string, map: WeightedMapTrie): number {
    const aN = a.length;
    const bN = b.length;
    const cost = 100;

    const candidates = new PairingHeap(compare);

    // Positions that have already been expanded, encoded as `ai * stride + bi`.
    // Offsets never exceed the word lengths, so the encoding is unique.
    const stride = bN + 1;
    const visited = new Set<number>();

    candidates.add({ ai: 0, bi: 0, c: 0 });

    /** Substitute: consume one character from each word; free when the characters match. */
    function opSub(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN && bi < bN) {
            const cc = a[ai] === b[bi] ? c : c + cost;
            candidates.add({ ai: ai + 1, bi: bi + 1, c: cc });
        }
    }

    /** Insert: consume one character of `b` only. */
    function opIns(n: Node) {
        const { ai, bi, c } = n;
        if (bi < bN) {
            candidates.add({ ai: ai, bi: bi + 1, c: c + cost });
        }
    }

    /** Delete: consume one character of `a` only. */
    function opDel(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN) {
            candidates.add({ ai: ai + 1, bi: bi, c: c + cost });
        }
    }

    /** Swap: consume two characters from each word when they are the same pair in reverse order. */
    function opSwap(n: Node) {
        const { ai, bi, c } = n;
        if (a[ai] === b[bi + 1] && a[ai + 1] === b[bi]) {
            candidates.add({ ai: ai + 2, bi: bi + 2, c: c + cost });
        }
    }

    /** Apply every edit from `map` that matches at the position of `n`. */
    function opMap(n: Node) {
        // `c` is the cost accumulated up to `n`; every candidate added below builds on it.
        const { ai, bi, c } = n;

        /** Weighted insert: follow `b` through the trie, adding a candidate wherever `insDel` is set. */
        function ins(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (bi >= bN || !m) return;
            const n = m[b[bi]];
            if (!n) return;
            const cost = n.insDel;
            ++bi;
            if (cost !== undefined) {
                candidates.add({ ai, bi, c: c + cost });
            }
            ins(ai, bi, n.t);
        }

        /** Weighted delete: follow `a` through the trie, adding a candidate wherever `insDel` is set. */
        function del(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (ai >= aN || !m) return;
            const n = m[a[ai]];
            if (!n) return;
            ++ai;
            const cost = n.insDel;
            if (cost !== undefined) {
                candidates.add({ ai, bi, c: c + cost });
            }
            del(ai, bi, n.t);
        }

        /** Second half of a replace: follow `b` through the replacement trie, adding a candidate wherever `rep` is set. */
        function repApply(ai: number, bi: number, m: WeightedRepMapTrie | undefined) {
            if (!m || bi >= bN) return;
            const char = b[bi];
            const n = m[char];
            if (!n) return;
            ++bi;
            const cost = n.rep;
            if (cost !== undefined) {
                candidates.add({ ai, bi, c: c + cost });
            }
            repApply(ai, bi, n.r);
        }

        /** First half of a replace: follow `a` through the trie and try the replacements of every prefix matched. */
        function rep(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (!m || ai >= aN || bi >= bN) return;
            const n = m[a[ai]];
            if (!n) return;
            ++ai;
            repApply(ai, bi, n.r);
            rep(ai, bi, n.t);
        }

        /**
         * Weighted swap: split `a` at `ai` into a left part `a[ai..mid)` and a right part
         * `a[mid..right)` that the trie pairs up, and accept the swap when `b` at `bi` reads
         * right part followed by left part.
         */
        function swap(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (!m || ai >= aN || bi >= bN) return;

            function apply(mid: number, right: number, cost: number | undefined) {
                if (cost === undefined) return;
                const swap = a.slice(mid, right) + a.slice(ai, mid);
                const len = swap.length;

                const subB = b.slice(bi, bi + len);
                if (swap === subB) {
                    // The swap weight is added to the cost accumulated so far, like every other
                    // edit. Using the bare weight would discard the cost of all earlier edits.
                    candidates.add({ ai: ai + len, bi: bi + len, c: c + cost });
                }
            }

            /** Extend the right part one character at a time through the `r` links. */
            function right(aim: number, ail: number, m: WeightedRepMapTrie | undefined) {
                if (!m || ail >= aN) return;
                const n = m[a[ail]];
                if (!n) return;
                ++ail;
                apply(aim, ail, n.swap);
                right(aim, ail, n.r);
            }

            /** Extend the left part one character at a time through the `t` links. */
            function left(aim: number, m: WeightedMapTrie | undefined) {
                if (!m || aim >= aN) return;
                const n = m[a[aim]];
                if (!n) return;
                ++aim;
                right(aim, aim, n.r);
                left(aim, n.t);
            }

            left(ai, m);
        }

        ins(ai, bi, map);
        del(ai, bi, map);
        rep(ai, bi, map);
        swap(ai, bi, map);
    }

    let best: Node | undefined;
    while ((best = candidates.dequeue())) {
        if (best.ai === aN && best.bi === bN) break;

        // Candidates come out in non-decreasing cost order, so (given non-negative weights) the
        // first time a position is dequeued it already has its lowest cost. Expanding a later
        // duplicate can only repeat work; without this check the number of expansions grows
        // exponentially with the number of edits.
        const key = best.ai * stride + best.bi;
        if (visited.has(key)) continue;
        visited.add(key);

        opSwap(best);
        opIns(best);
        opDel(best);
        opMap(best);
        opSub(best);
    }

    // istanbul ignore else
    return best ? best.c : -1;
}

/** Offsets into the two words being compared. */
interface Pos {
    /** how far into string `a` the search has advanced */
    ai: number;
    /** how far into string `b` the search has advanced */
    bi: number;
}

/** A pending search candidate. */
interface Node extends Pos {
    /** total cost paid to reach this position */
    c: number;
}

/**
 * Heap ordering for candidates: lower cost wins; on a cost tie, the candidate
 * that is further along both words (greater `ai + bi`) wins.
 */
function compare(a: Node, b: Node): number {
    const byCost = a.c - b.c;
    if (byCost) {
        return byCost;
    }
    return b.ai + b.bi - a.ai - a.bi;
}
23 changes: 23 additions & 0 deletions packages/cspell-trie-lib/src/lib/suggestions/levenshtein.test.ts
@@ -0,0 +1,23 @@
import { levenshteinDistance } from './levenshtein';

// Checks `levenshteinDistance` against a table of known distances.
describe('levenshtein', () => {
// Each table row is a pair of words and the distance expected between them.
// The 'ab' / 'ba' row expects 1, i.e. exchanging two adjacent characters counts as a single edit.
test.each`
left | right | expected
${'abc'} | ${'abc'} | ${0}
${'abc'} | ${'ab'} | ${1}
${'abc'} | ${''} | ${3}
${'kitten'} | ${'sitting'} | ${3}
${'Saturday'} | ${'Sunday'} | ${3}
${'ab'} | ${'ba'} | ${1}
${'aba'} | ${'bab'} | ${2}
${'abab'} | ${'baba'} | ${2}
${'abab'} | ${'ababa'} | ${1}
${'appear'} | ${'apple'} | ${3}
${'appease'} | ${'apple'} | ${3}
`('levenshteinDistance "$left" vs "$right"', ({ left, right, expected }) => {
// The distance must be the same in both directions.
expect(levenshteinDistance(left, right)).toBe(expected);
expect(levenshteinDistance(right, left)).toBe(expected);
});
});

// cspell:ignore ababa

0 comments on commit 3a4f9db

Please sign in to comment.