improve performance of find_similar_names

On most platforms this improves performance by 30-40x by using a highly optimized C extension. On unsupported platforms it falls back to a pure Python implementation, which still improves performance by 2x.
python-poetry · Jul 19, 2022 · e96c529 · e96c529
1 parent fa8d9a7
commit e96c529
Show file tree

Hide file tree

Showing 3 changed files with 272 additions and 50 deletions.
diff --git a/cleo/_utils.py b/cleo/_utils.py
@@ -5,7 +5,7 @@
 from html.parser import HTMLParser
 from typing import Any
 
-from pylev import levenshtein
+from rapidfuzz.distance import Levenshtein
 
 
 class TagStripper(HTMLParser):
@@ -54,11 +54,10 @@ def find_similar_names(name: str, names: list[str]) -> list[str]:
     """
     threshold = 1e3
     distance_by_name = {}
-    suggested_names = []
 
     for actual_name in names:
         # Get Levenshtein distance between the input and each command name
-        distance = levenshtein(name, actual_name)
+        distance = Levenshtein.distance(name, actual_name)
 
         is_similar = distance <= len(name) / 3
         is_sub_string = actual_name.find(name) != -1
@@ -75,11 +74,7 @@ def find_similar_names(name: str, names: list[str]) -> list[str]:
     }
 
     # Display results with shortest distance first
-    for k, _v in sorted(distance_by_name.items(), key=lambda i: (i[1][0], i[1][1])):
-        if k not in suggested_names:
-            suggested_names.append(k)
-
-    return suggested_names
+    return sorted(distance_by_name, key=distance_by_name.get)
 
 
 _TIME_FORMATS = [