Merge pull request #117 from David-Desmaisons/master

Improving fuzzywuzzy performance
seatgeek · Jun 30, 2016 · 56bc3d0 · 56bc3d0
2 parents b90f6c9 + 7311b37
commit 56bc3d0
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 46 deletions.
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
@@ -94,10 +94,11 @@ def partial_ratio(s1, s2):
 # Advanced Scoring Functions #
 ##############################
 
-def _process_and_sort(s, force_ascii):
+def _process_and_sort(s, force_ascii, full_process=True):
     """Return a cleaned string with token sorted."""
     # pull tokens
-    tokens = utils.full_process(s, force_ascii=force_ascii).split()
+    ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
+    tokens = ts.split()
 
     # sort tokens and join
     sorted_string = u" ".join(sorted(tokens))
@@ -109,50 +110,50 @@ def _process_and_sort(s, force_ascii):
 #   sort those tokens and take ratio of resulting joined strings
 #   controls for unordered string elements
 @utils.check_for_none
-def _token_sort(s1, s2, partial=True, force_ascii=True):
-    sorted1 = _process_and_sort(s1, force_ascii)
-    sorted2 = _process_and_sort(s2, force_ascii)
+def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
+    sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
+    sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)
 
     if partial:
         return partial_ratio(sorted1, sorted2)
     else:
         return ratio(sorted1, sorted2)
 
 
-def token_sort_ratio(s1, s2, force_ascii=True):
+def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
     """Return a measure of the sequences' similarity between 0 and 100
     but sorting the token before comparing.
     """
-    return _token_sort(s1, s2, partial=False, force_ascii=force_ascii)
+    return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
 
 
-def partial_token_sort_ratio(s1, s2, force_ascii=True):
+def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
     """Return the ratio of the most similar substring as a number between
     0 and 100 but sorting the token before comparing.
     """
-    return _token_sort(s1, s2, partial=True, force_ascii=force_ascii)
+    return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
 
 
 @utils.check_for_none
-def _token_set(s1, s2, partial=True, force_ascii=True):
+def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     """Find all alphanumeric tokens in each string...
         - treat them as a set
         - construct two strings of the form:
             <sorted_intersection><sorted_remainder>
         - take ratios of those two strings
         - controls for unordered partial matches"""
 
-    p1 = utils.full_process(s1, force_ascii=force_ascii)
-    p2 = utils.full_process(s2, force_ascii=force_ascii)
+    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
+    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
 
     if not utils.validate_string(p1):
         return 0
     if not utils.validate_string(p2):
         return 0
 
     # pull tokens
-    tokens1 = set(utils.full_process(p1).split())
-    tokens2 = set(utils.full_process(p2).split())
+    tokens1 = set(p1.split())
+    tokens2 = set(p2.split())
 
     intersection = tokens1.intersection(tokens2)
     diff1to2 = tokens1.difference(tokens2)
@@ -183,12 +184,12 @@ def _token_set(s1, s2, partial=True, force_ascii=True):
     return max(pairwise)
 
 
-def token_set_ratio(s1, s2, force_ascii=True):
-    return _token_set(s1, s2, partial=False, force_ascii=force_ascii)
+def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
 
 
-def partial_token_set_ratio(s1, s2, force_ascii=True):
-    return _token_set(s1, s2, partial=True, force_ascii=force_ascii)
+def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
 
 
 ###################
@@ -245,15 +246,15 @@ def WRatio(s1, s2, force_ascii=True):
 
     if try_partial:
         partial = partial_ratio(p1, p2) * partial_scale
-        ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
+        ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
             * unbase_scale * partial_scale
-        ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
+        ptser = partial_token_set_ratio(p1, p2, full_process=False) \
             * unbase_scale * partial_scale
 
         return utils.intr(max(base, partial, ptsor, ptser))
     else:
-        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
-        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
+        tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
+        tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale
 
         return utils.intr(max(base, tsor, tser))
 

diff --git a/fuzzywuzzy/process.py b/fuzzywuzzy/process.py
@@ -24,17 +24,17 @@
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 """
-import itertools
 
 from . import fuzz
 from . import utils
+import heapq
 
 
-def extract(query, choices, processor=None, scorer=None, limit=5):
+def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutoff=0):
     """Select the best match in a list or dictionary of choices.
 
     Find best matches in a list or dictionary of choices, return a
-    list of tuples containing the match and it's score. If a dictionary
+    generator of tuples containing the match and its score. If a dictionary
     is used, also returns the key for each match.
 
     Arguments:
@@ -58,11 +58,11 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
 
             By default, fuzz.WRatio() is used and expects both query and
             choice to be strings.
-        limit: Optional maximum for the number of elements returned. Defaults
-            to 5.
+        score_cutoff: Optional argument for score threshold. No matches with
+            a score less than this number will be returned. Defaults to 0.
 
     Returns:
-        List of tuples containing the match and its score.
+        Generator of tuples containing the match and its score.
 
         If a list is used for choices, then the result will be 2-tuples.
         If a dictionary is used, then the result will be 3-tuples containing
@@ -74,44 +74,96 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
 
         may return
 
-        [('train', 22, 'bard'), ('man', 0, 'dog')]
+        ('train', 22, 'bard'), ('man', 0, 'dog')
     """
+    def no_process(x):
+        return x
 
     if choices is None:
-        return []
+        raise StopIteration
 
     # Catch generators without lengths
     try:
         if len(choices) == 0:
-            return []
+            raise StopIteration
     except TypeError:
         pass
 
-    # default, turn whatever the choice is into a workable string
-    if not processor:
-        processor = utils.full_process
-
     # default: wratio
     if not scorer:
         scorer = fuzz.WRatio
+        # fuzz.WRatio already processes its input strings, so no extra step is needed
+        if not processor:
+            processor = no_process
 
-    sl = []
+    # default, turn whatever the choice is into a workable string
+    if not processor:
+        processor = utils.full_process
 
     try:
         # See if choices is a dictionary-like object.
         for key, choice in choices.items():
             processed = processor(choice)
             score = scorer(query, processed)
-            sl.append((choice, score, key))
+            if score >= score_cutoff:
+                yield (choice, score, key)
     except AttributeError:
         # It's a list; just iterate over it.
         for choice in choices:
             processed = processor(choice)
             score = scorer(query, processed)
-            sl.append((choice, score))
+            if score >= score_cutoff:
+                yield (choice, score)
+
 
-    sl.sort(key=lambda i: i[1], reverse=True)
-    return sl[:limit]
+def extract(query, choices, processor=None, scorer=None, limit=5):
+    """Select the best match in a list or dictionary of choices.
+
+    Find best matches in a list or dictionary of choices, return a
+    list of tuples containing the match and its score. If a dictionary
+    is used, also returns the key for each match.
+
+    Arguments:
+        query: An object representing the thing we want to find.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query. Dictionary arguments of
+            {key: value} pairs will attempt to match the query against
+            each value.
+        processor: Optional function of the form f(a) -> b, where a is an
+            individual choice and b is the choice to be used in matching.
+
+            This can be used to match against, say, the first element of
+            a list:
+
+            lambda x: x[0]
+
+            Defaults to fuzzywuzzy.utils.full_process().
+        scorer: Optional function for scoring matches between the query and
+            an individual processed choice. This should be a function
+            of the form f(query, choice) -> int.
+            By default, fuzz.WRatio() is used and expects both query and
+            choice to be strings.
+        limit: Optional maximum for the number of elements returned. Defaults
+            to 5.
+
+    Returns:
+        List of tuples containing the match and its score.
+
+        If a list is used for choices, then the result will be 2-tuples.
+        If a dictionary is used, then the result will be 3-tuples containing
+        the key for each match.
+
+        For example, searching for 'bird' in the dictionary
+
+        {'bard': 'train', 'dog': 'man'}
+
+        may return
+
+        [('train', 22, 'bard'), ('man', 0, 'dog')]
+    """
+    sl = extractWithoutOrder(query, choices, processor, scorer)
+    return heapq.nlargest(limit, sl, key=lambda i: i[1]) if limit is not None else \
+        sorted(sl, key=lambda i: i[1], reverse=True)
 
 
 def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
@@ -133,8 +185,10 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
 
     Returns: A a list of (match, score) tuples.
     """
-    best_list = extract(query, choices, processor, scorer, limit)
-    return list(itertools.takewhile(lambda x: x[1] >= score_cutoff, best_list))
+
+    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
+    return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
+        sorted(best_list, key=lambda i: i[1], reverse=True)
 
 
 def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
@@ -158,10 +212,11 @@ def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
         A tuple containing a single match and its score, if a match
         was found that was above score_cutoff. Otherwise, returns None.
     """
-    best_list = extract(query, choices, processor, scorer, limit=1)
-    if len(best_list) > 0 and best_list[0][1] >= score_cutoff:
-        return best_list[0]
-    return None
+    best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
+    try:
+        return max(best_list, key=lambda i: i[1])
+    except ValueError:
+        return None
 
 
 def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):