Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing some performance bottlenecks #183

Merged
merged 14 commits into from
May 3, 2022
25 changes: 11 additions & 14 deletions charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,10 @@ def characters_popularity_compare(
raise ValueError("{} not available".format(language))

character_approved_count = 0 # type: int

FREQUENCIES_language_set = set(FREQUENCIES[language])

for character in ordered_characters:
if character not in FREQUENCIES[language]:
if character not in FREQUENCIES_language_set:
continue

characters_before_source = FREQUENCIES[language][
Expand All @@ -186,24 +187,20 @@ def characters_popularity_compare(
characters_after_source = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]

characters_before = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]

before_match_count = [
e in characters_before for e in characters_before_source
].count(
True
) # type: int
after_match_count = [
e in characters_after for e in characters_after_source
].count(
True
) # type: int

before_match_count = len(
set(characters_before) & set(characters_before_source)
) # type: int

after_match_count = len(
set(characters_after) & set(characters_after_source)
) # type: int

if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
Expand Down
13 changes: 5 additions & 8 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
Expand Down Expand Up @@ -137,13 +138,9 @@ def __init__(self) -> None:

def eligible(self, character: str) -> bool:
return True

def feed(self, character: str) -> None:
if (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
):
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1

Expand All @@ -167,7 +164,7 @@ def __init__(self) -> None:

def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)

def feed(self, character: str) -> None:
self._character_count += 1
if (
Expand Down Expand Up @@ -445,7 +442,7 @@ def ratio(self) -> float:

return self._successive_upper_lower_count_final / self._character_count


@lru_cache(maxsize=1024)
Ousret marked this conversation as resolved.
Show resolved Hide resolved
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
Expand Down
11 changes: 11 additions & 0 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def is_latin(character: str) -> bool:
return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Ousret marked this conversation as resolved.
Show resolved Hide resolved
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
Expand Down Expand Up @@ -197,6 +198,16 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """Tell whether *character* should be counted as unprintable.

    A character is reported as unprintable when it is neither whitespace
    nor printable, with one deliberate exception: ``"\\x1A"``.

    :param character: The string to classify (callers pass one character).
    :return: ``True`` if the character is considered unprintable,
        ``False`` otherwise.
    """
    return (
        not character.isspace()  # whitespace includes \n \t \r \v
        and not character.isprintable()
        and character != "\x1A"  # Why? It's the ASCII substitute character.
    )

def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
Expand Down
14 changes: 14 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from setuptools import find_packages, setup

from mypyc.build import mypycify


def get_version():
with open('charset_normalizer/version.py') as version_file:
Expand Down Expand Up @@ -51,6 +53,18 @@ def get_version():
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
install_requires=REQUIRED,
extras_require=EXTRAS,
ext_modules=mypycify([
'charset_normalizer/__init__.py',
'charset_normalizer/api.py',
'charset_normalizer/constant.py',
'charset_normalizer/cd.py',
'charset_normalizer/md.py',
'charset_normalizer/models.py',
'charset_normalizer/utils.py',
'charset_normalizer/assets/__init__.py',
'charset_normalizer/cli/normalizer.py'

deedy5 marked this conversation as resolved.
Show resolved Hide resolved
]),
include_package_data=True,
package_data={"charset_normalizer": ["py.typed"]},
license='MIT',
Expand Down