Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

_set_token_ratio now keeps tokenization. #300

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 11 additions & 1 deletion fuzzywuzzy/StringMatcher.py
Expand Up @@ -8,7 +8,7 @@
License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING
"""

from Levenshtein import *
from Levenshtein._levenshtein import *
from warnings import warn


Expand Down Expand Up @@ -64,6 +64,16 @@ def ratio(self):
self._ratio = ratio(self._str1, self._str2)
return self._ratio

def setratio(self):
    """Return the ``setratio`` of the two stored strings, computing it once.

    On the first call the module-scope ``setratio`` function is applied to
    ``self._str1`` and ``self._str2`` and the result is stored on the
    instance as ``self._setratio``; every later call returns that stored
    value without recomputing.

    NOTE(review): the memo is keyed only on the attribute existing, so it
    is not refreshed if ``_str1``/``_str2`` are reassigned afterwards --
    confirm whether the class's cache-reset logic needs to clear it.
    """
    # Fast path: a previous call already stored the result.
    if hasattr(self, '_setratio'):
        return self._setratio
    self._setratio = setratio(self._str1, self._str2)
    return self._setratio

def seqratio(self):
    """Return the ``seqratio`` of the two stored strings, computing it once.

    On the first call the module-scope ``seqratio`` function is applied to
    ``self._str1`` and ``self._str2`` and the result is stored on the
    instance as ``self._seqratio``; every later call returns that stored
    value without recomputing.

    NOTE(review): the memo is keyed only on the attribute existing, so it
    is not refreshed if ``_str1``/``_str2`` are reassigned afterwards --
    confirm whether the class's cache-reset logic needs to clear it.
    """
    # Fast path: a previous call already stored the result.
    if hasattr(self, '_seqratio'):
        return self._seqratio
    self._seqratio = seqratio(self._str1, self._str2)
    return self._seqratio

def quick_ratio(self):
# This is usually quick enough :o)
if not self._ratio:
Expand Down
48 changes: 38 additions & 10 deletions fuzzywuzzy/fuzz.py
Expand Up @@ -8,7 +8,8 @@
from .StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
if platform.python_implementation() != "PyPy":
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
warnings.warn(
'Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
from difflib import SequenceMatcher

from . import utils
Expand All @@ -28,6 +29,26 @@ def ratio(s1, s2):
return utils.intr(100 * m.ratio())


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def setratio(s1, s2):
    """Score ``s1`` against ``s2`` using the matcher's ``setratio`` measure.

    Both inputs are first passed through ``utils.make_type_consistent``.
    A ``SequenceMatcher`` is then built over the pair, and its
    ``setratio()`` result is multiplied by 100 and converted with
    ``utils.intr`` before being returned.

    NOTE(review): when the StringMatcher import fails, this module falls
    back to ``difflib.SequenceMatcher``, which does not provide a
    ``setratio()`` method -- presumably this call raises AttributeError in
    that configuration; confirm and decide on a fallback.
    """
    left, right = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, left, right)
    raw_score = matcher.setratio()
    return utils.intr(100 * raw_score)


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def seqratio(s1, s2):
    """Score ``s1`` against ``s2`` using the matcher's ``seqratio`` measure.

    Both inputs are first passed through ``utils.make_type_consistent``.
    A ``SequenceMatcher`` is then built over the pair, and its
    ``seqratio()`` result is multiplied by 100 and converted with
    ``utils.intr`` before being returned.

    NOTE(review): when the StringMatcher import fails, this module falls
    back to ``difflib.SequenceMatcher``, which does not provide a
    ``seqratio()`` method -- presumably this call raises AttributeError in
    that configuration; confirm and decide on a fallback.
    """
    left, right = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, left, right)
    raw_score = matcher.seqratio()
    return utils.intr(100 * raw_score)


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
Expand Down Expand Up @@ -124,8 +145,10 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
if not full_process and s1 == s2:
return 100

p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
p1 = utils.full_process(
s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(
s2, force_ascii=force_ascii) if full_process else s2

if not utils.validate_string(p1):
return 0
Expand All @@ -140,23 +163,28 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
diff1to2 = tokens1.difference(tokens2)
diff2to1 = tokens2.difference(tokens1)

sorted_sect = " ".join(sorted(intersection))
sorted_1to2 = " ".join(sorted(diff1to2))
sorted_2to1 = " ".join(sorted(diff2to1))
delimiter = "+++"
sorted_sect = delimiter.join(sorted(intersection))
sorted_1to2 = delimiter.join(sorted(diff1to2))
sorted_2to1 = delimiter.join(sorted(diff2to1))

combined_1to2 = sorted_sect + " " + sorted_1to2
combined_2to1 = sorted_sect + " " + sorted_2to1
combined_1to2 = sorted_sect + delimiter + sorted_1to2
combined_2to1 = sorted_sect + delimiter + sorted_2to1

# strip
sorted_sect = sorted_sect.strip()
combined_1to2 = combined_1to2.strip()
combined_2to1 = combined_2to1.strip()

# replace
sorted_sect = sorted_sect.replace(delimiter, " ")
combined_1to2 = combined_1to2.replace(delimiter, " ")
combined_2to1 = combined_2to1.replace(delimiter, " ")

if partial:
ratio_func = partial_ratio
else:
ratio_func = ratio

ratio_func = setratio
pairwise = [
ratio_func(sorted_sect, combined_1to2),
ratio_func(sorted_sect, combined_2to1),
Expand Down
1 change: 1 addition & 0 deletions test_fuzzywuzzy.py
Expand Up @@ -140,6 +140,7 @@ def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=False), 100)
self.assertEqual(fuzz.token_set_ratio(self.s10, self.s10a, full_process=False), 50)


def testPartialTokenSetRatio(self):
self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)
Expand Down