Skip to content

Commit

Permalink
improve performance of find_similar_names
Browse files Browse the repository at this point in the history
On most platforms this improves performance by 30-40x
by using a highly optimized C extension. On unsupported platforms
it falls back to a pure Python implementation, which still improves
performance by 2x.
  • Loading branch information
maxbachmann committed Jul 19, 2022
1 parent fa8d9a7 commit e96c529
Show file tree
Hide file tree
Showing 3 changed files with 272 additions and 50 deletions.
11 changes: 3 additions & 8 deletions cleo/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from html.parser import HTMLParser
from typing import Any

from pylev import levenshtein
from rapidfuzz.distance import Levenshtein


class TagStripper(HTMLParser):
Expand Down Expand Up @@ -54,11 +54,10 @@ def find_similar_names(name: str, names: list[str]) -> list[str]:
"""
threshold = 1e3
distance_by_name = {}
suggested_names = []

for actual_name in names:
# Get Levenshtein distance between the input and each command name
distance = levenshtein(name, actual_name)
distance = Levenshtein.distance(name, actual_name)

is_similar = distance <= len(name) / 3
is_sub_string = actual_name.find(name) != -1
Expand All @@ -75,11 +74,7 @@ def find_similar_names(name: str, names: list[str]) -> list[str]:
}

# Display results with shortest distance first
for k, _v in sorted(distance_by_name.items(), key=lambda i: (i[1][0], i[1][1])):
if k not in suggested_names:
suggested_names.append(k)

return suggested_names
return sorted(distance_by_name, key=distance_by_name.get)


_TIME_FORMATS = [
Expand Down

0 comments on commit e96c529

Please sign in to comment.