Skip to content

Commit

Permalink
Merge pull request #117 from David-Desmaisons/master
Browse files Browse the repository at this point in the history
Improving fuzzywuzzy performance
  • Loading branch information
josegonzalez committed Jun 30, 2016
2 parents b90f6c9 + 7311b37 commit 56bc3d0
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 46 deletions.
45 changes: 23 additions & 22 deletions fuzzywuzzy/fuzz.py
Expand Up @@ -94,10 +94,11 @@ def partial_ratio(s1, s2):
# Advanced Scoring Functions #
##############################

def _process_and_sort(s, force_ascii):
def _process_and_sort(s, force_ascii, full_process=True):
"""Return a cleaned string with token sorted."""
# pull tokens
tokens = utils.full_process(s, force_ascii=force_ascii).split()
ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
tokens = ts.split()

# sort tokens and join
sorted_string = u" ".join(sorted(tokens))
Expand All @@ -109,50 +110,50 @@ def _process_and_sort(s, force_ascii):
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
@utils.check_for_none
def _token_sort(s1, s2, partial=True, force_ascii=True):
sorted1 = _process_and_sort(s1, force_ascii)
sorted2 = _process_and_sort(s2, force_ascii)
def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)

if partial:
return partial_ratio(sorted1, sorted2)
else:
return ratio(sorted1, sorted2)


def token_sort_ratio(s1, s2, force_ascii=True):
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
"""Return a measure of the sequences' similarity between 0 and 100
but sorting the token before comparing.
"""
return _token_sort(s1, s2, partial=False, force_ascii=force_ascii)
return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)


def partial_token_sort_ratio(s1, s2, force_ascii=True):
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
"""Return the ratio of the most similar substring as a number between
0 and 100 but sorting the token before comparing.
"""
return _token_sort(s1, s2, partial=True, force_ascii=force_ascii)
return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


@utils.check_for_none
def _token_set(s1, s2, partial=True, force_ascii=True):
def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
"""Find all alphanumeric tokens in each string...
- treat them as a set
- construct two strings of the form:
<sorted_intersection><sorted_remainder>
- take ratios of those two strings
- controls for unordered partial matches"""

p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0

# pull tokens
tokens1 = set(utils.full_process(p1).split())
tokens2 = set(utils.full_process(p2).split())
tokens1 = set(p1.split())
tokens2 = set(p2.split())

intersection = tokens1.intersection(tokens2)
diff1to2 = tokens1.difference(tokens2)
Expand Down Expand Up @@ -183,12 +184,12 @@ def _token_set(s1, s2, partial=True, force_ascii=True):
return max(pairwise)


def token_set_ratio(s1, s2, force_ascii=True):
return _token_set(s1, s2, partial=False, force_ascii=force_ascii)
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)


def partial_token_set_ratio(s1, s2, force_ascii=True):
return _token_set(s1, s2, partial=True, force_ascii=force_ascii)
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


###################
Expand Down Expand Up @@ -245,15 +246,15 @@ def WRatio(s1, s2, force_ascii=True):

if try_partial:
partial = partial_ratio(p1, p2) * partial_scale
ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
* unbase_scale * partial_scale
ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
ptser = partial_token_set_ratio(p1, p2, full_process=False) \
* unbase_scale * partial_scale

return utils.intr(max(base, partial, ptsor, ptser))
else:
tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale

return utils.intr(max(base, tsor, tser))

Expand Down
103 changes: 79 additions & 24 deletions fuzzywuzzy/process.py
Expand Up @@ -24,17 +24,17 @@
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
import itertools

from . import fuzz
from . import utils
import heapq


def extract(query, choices, processor=None, scorer=None, limit=5):
def extractWithoutOrder(query, choices, processor=None, scorer=None, score_cutoff=0):
"""Select the best match in a list or dictionary of choices.
Find best matches in a list or dictionary of choices, return a
list of tuples containing the match and its score. If a dictionary
generator of tuples containing the match and its score. If a dictionary
is used, also returns the key for each match.
Arguments:
Expand All @@ -58,11 +58,11 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
By default, fuzz.WRatio() is used and expects both query and
choice to be strings.
limit: Optional maximum for the number of elements returned. Defaults
to 5.
score_cutoff: Optional argument for score threshold. No matches with
a score less than this number will be returned. Defaults to 0.
Returns:
List of tuples containing the match and its score.
Generator of tuples containing the match and its score.
If a list is used for choices, then the result will be 2-tuples.
If a dictionary is used, then the result will be 3-tuples containing
Expand All @@ -74,44 +74,96 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
may return
[('train', 22, 'bard'), ('man', 0, 'dog')]
('train', 22, 'bard'), ('man', 0, 'dog')
"""
def no_process(x):
return x

if choices is None:
return []
raise StopIteration

# Catch generators without lengths
try:
if len(choices) == 0:
return []
raise StopIteration
except TypeError:
pass

# default, turn whatever the choice is into a workable string
if not processor:
processor = utils.full_process

# default: wratio
if not scorer:
scorer = fuzz.WRatio
# fuzz.WRatio already processes the strings, so no extra processing step is needed
if not processor:
processor = no_process

sl = []
# default, turn whatever the choice is into a workable string
if not processor:
processor = utils.full_process

try:
# See if choices is a dictionary-like object.
for key, choice in choices.items():
processed = processor(choice)
score = scorer(query, processed)
sl.append((choice, score, key))
if score >= score_cutoff:
yield (choice, score, key)
except AttributeError:
# It's a list; just iterate over it.
for choice in choices:
processed = processor(choice)
score = scorer(query, processed)
sl.append((choice, score))
if score >= score_cutoff:
yield (choice, score)


sl.sort(key=lambda i: i[1], reverse=True)
return sl[:limit]
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Return the highest-scoring matches for a query, best first.

    Every choice is scored by extractWithoutOrder(); this function only
    ranks those results and trims them to the requested size.

    Arguments:
        query: An object representing the thing we want to find.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query. For dictionary arguments of
            {key: value} pairs, the query is matched against each value.
        processor: Optional function of the form f(a) -> b, where a is an
            individual choice and b is the choice to be used in matching,
            e.g. to match against the first element of a list:

                lambda x: x[0]

            Passed through unchanged to extractWithoutOrder(), which
            selects a default when none is given.
        scorer: Optional function of the form f(query, choice) -> int used
            to score a processed choice against the query. Passed through
            unchanged to extractWithoutOrder(), which selects a default
            when none is given.
        limit: Optional maximum for the number of elements returned.
            Defaults to 5. Pass None to get every match, sorted by score.

    Returns:
        List of tuples containing the match and its score, ordered from
        the highest score to the lowest.

        If a list is used for choices, then the result will be 2-tuples.
        If a dictionary is used, then the result will be 3-tuples
        containing the key for each match.

        For example, searching for 'bird' in the dictionary

            {'bard': 'train', 'dog': 'man'}

        may return

            [('train', 22, 'bard'), ('man', 0, 'dog')]
    """
    def by_score(match):
        # The score is always the second element of a result tuple.
        return match[1]

    scored = extractWithoutOrder(query, choices, processor, scorer)

    # No limit requested: rank everything.
    if limit is None:
        return sorted(scored, key=by_score, reverse=True)

    # Otherwise keep only the `limit` best without sorting the whole stream.
    return heapq.nlargest(limit, scored, key=by_score)


def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
Expand All @@ -133,8 +185,10 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
Returns: A list of (match, score) tuples.
"""
best_list = extract(query, choices, processor, scorer, limit)
return list(itertools.takewhile(lambda x: x[1] >= score_cutoff, best_list))

best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
return heapq.nlargest(limit, best_list, key=lambda i: i[1]) if limit is not None else \
sorted(best_list, key=lambda i: i[1], reverse=True)


def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
Expand All @@ -158,10 +212,11 @@ def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
A tuple containing a single match and its score, if a match
was found that was above score_cutoff. Otherwise, returns None.
"""
best_list = extract(query, choices, processor, scorer, limit=1)
if len(best_list) > 0 and best_list[0][1] >= score_cutoff:
return best_list[0]
return None
best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
try:
return max(best_list, key=lambda i: i[1])
except ValueError:
return None


def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
Expand Down

0 comments on commit 56bc3d0

Please sign in to comment.