Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Structured Metadata Identity #471

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions importlib_metadata/__init__.py
Expand Up @@ -28,6 +28,7 @@
from ._functools import method_cache, pass_none
from ._itertools import always_iterable, unique_everseen
from ._meta import PackageMetadata, SimplePath
from ._adapters import Ident

from contextlib import suppress
from importlib import import_module
Expand All @@ -38,6 +39,7 @@
__all__ = [
'Distribution',
'DistributionFinder',
'Ident',
'PackageMetadata',
'PackageNotFoundError',
'distribution',
Expand Down
255 changes: 255 additions & 0 deletions importlib_metadata/_adapters.py
Expand Up @@ -3,6 +3,9 @@
import re
import textwrap
import email.message
import dataclasses

from typing import Optional

from ._text import FoldedCase
from ._compat import pypy_partial
Expand All @@ -16,6 +19,232 @@
stacklevel=pypy_partial(2),
)

# The formatting of the identity fields ("Author", "Maintainer", "Author-email"
# and "Maintainer-email") in the core metadata specification and the related
# 'pyproject.toml' specification is inspired by RFC 5322 but not precisely
# defined. In practice, conflicting definitions are used by many packages and
# even by examples in the specification. For a permissive parser, the key
# takeaway from RFC 5322 is that special characters such as "," and "@" must
# be quoted when used as text.


def _entries_findall(string):
"""
Return a list of entries given an RFC5233-inspired string. Entries are
separated by ", " and contents of quoted strings are ignored. Each
entry will be a non-empty string.

>>> _entries_findall('a, b , c, "d, e, f"')
['a', 'b ', ' c', '"d, e, f"']

>>> _entries_findall('a')
['a']

>>> _entries_findall('')
[]

>>> _entries_findall(", ")
[]
"""

# Split an RFC5233-ish list:
# 1. Require a list separator, or beginning-of-string.
# 2. Alt 1: match single or double quotes and handle escape characters.
# 3. Alt 2: match anything except ',' followed by a space. If quote
# characters are unbalanced, they will be matched here.
# 4. Match the alternatives at least once, in any order...
# 5. ... and capture them.
# Result:
# group 1 (list entry): None or non-empty string.
_entries = re.compile(
r"""
(?: (?<=,\ ) | (?<=^) ) # 1
( (?: (["']) (?:(?!\2|\\).|\\.)* \2 # 2
| (?!,\ ). # 3
)+ # 4
) # 5
""",
re.VERBOSE,
)

return [entry[0] for entry in _entries.findall(string)]


def _name_email_split(string):
"""
Split an RFC5233-inspired entry into a name and email address tuple. Each
component will be either None or a non-empty string. Split the form "name
local@domain" on the first unquoted "@" such that:

* local may not be empty and may not contain any unquoted spaces
* domain may not be empty
* spaces between name and address are consumed
* space between name and address is optional if name ends in "@"
* first opening "<" of local is consumed only if local remains non-empty
* last closing ">" of domain is consumed only if domain remains non-empty

>>> _name_email_split("name local@domain")
('name', 'local@domain')

>>> _name_email_split('@"unlocal@undomain" @ <loc"al@dom\\'ain')
('@"unlocal@undomain" @', 'loc"al@dom\\'ain')

>>> _name_email_split('@@ local@domain')
('@@', 'local@domain')

>>> _name_email_split('@nameonly@')
('@nameonly@', None)

>>> _name_email_split('@domain@ ')
('@', 'domain@ ')

>>> _name_email_split(' domain@only')
(None, 'domain@only')

>>> _name_email_split(' ')
(' ', None)

>>> _name_email_split('')
(None, None)
"""

# Split an RFC5233-inspired name-address entry:
# 01. Start at the beginning.
# 02. Capture at least one name component, but optionally so the result
# will be 'None' rather than an empty string.
# 03. Stop matching against name components if the lookahead matches an
# address. An address can be preceded by spaces, which are optional if
# the name is missing.
# 04. Simulate a possessive quantifier for Python < 3.11 given the
# equivalence between "(...)++" and "(?=( (...)+ ))\1". The contained
# alternatives are not exclusive and the possessive quantifier prevents
# the second alternative from stealing quoted components during
# backtracking.
# 05. Alt 1.1: Match single-quoted or double-quoted components and handle
# escape characters.
# 06. Alt 1.2: Match any character except the local component delimiters
# " " or "@". If quote characters are unbalanced, they will be matched
# here.
# 07. Match the alternatives at least once - the local part of the address
# cannot be empty.
# 08. (See 04)
# 09. Match "@" followed by something - the domain cannot be empty either.
# 10. (See 03)
# 11. Alt 2.1: Match a quoted component...
# 12. Alt 2.2: ... or match a single character.
# ...
# 14. (See 02)
# 15. (See 02)
# 16. If the name portion is missing or ends with an "@", there may or may
# not be whitespace before the address. The opening angle bracket is
# always optional.
# ...
# 20. Match everything after "@" with a non-greedy quantifier to allow for
# the optional closing angle bracket.
# 21. Allow for no address component.
# 22. Match the optional closing angle bracket.
# 23. Finish at the end.
# Summary:
# ^ ( ( not: space* (quote | not:space-or-at)++ @ anything
# quote | anything
# )+
# )?
# space* <? ( (quote | not:space-or-at)+ @ anything+? )? >? $
# Result:
# group 1 (name): None or non-empty string.
# group 5 (email): None or non-empty string.
_name_email = re.compile(
r"""
^ # 01
( (?: # 02
(?! \ * # 03
(?=( # 04
(?: (["']) (?:(?!\3|\\).|\\.)* \3 # 05
| [^ @] # 06
)+ # 07
))\2 # 08
@ . # 09
) # 10
(?: (["']) (?:(?!\4|\\).|\\.)* \4 # 11
| . # 12
) # 13
)+ # 14
)? # 15
\ * <? # 16
( (?: (["']) (?:(?!\6|\\).|\\.)* \6 # 17
| [^ @] # 18
)+ # 19
@ .+? # 20
)? # 21
>? # 22
$ # 23
""",
re.VERBOSE,
)

# Equivalent, simpler, version using possessive quantifiers, for
# Python >= 3.11.
# _name_email = re.compile(
# r"""
# ^ ( (?: (?! \ *
# (?: (["']) (?:(?!\2|\\).|\\.)* \2
# | [^ @]
# )++
# @ .
# )
# (?: (["']) (?:(?!\3|\\).|\\.)* \3
# | .
# )
# )+
# )?
# \ * <?
# ( (?: (["']) (?:(?!\5|\\).|\\.)* \5
# | [^ @]
# )+
# @ .+?
# )?
# >?
# $
# """,
# re.VERBOSE,
# )

return _name_email.match(string).groups()[::4]


def _uniq(values):
"""
Return a list omitting duplicate values.

>>> _uniq([1, 2, 1, 2, 3, 1, 4])
[1, 2, 3, 4]

>>> _uniq(())
[]
"""
unique = set()
result = []
for value in values:
if value in unique:
continue
unique.add(value)
result.append(value)
return result


@dataclasses.dataclass(frozen=True, eq=True)
class Ident:
    """
    An immutable, hashable name/email pair describing one identity taken
    from the author or maintainer fields. Either attribute may be None.

    Iterating over an instance yields its field values in declaration
    order, so it can be unpacked like a tuple: ``name, email = ident``.
    """

    name: Optional[str]
    email: Optional[str]

    def __iter__(self):
        # Look the field names up immediately; the values are read lazily
        # as the returned generator is consumed.
        field_names = [spec.name for spec in dataclasses.fields(self)]
        return (getattr(self, field_name) for field_name in field_names)


class Message(email.message.Message):
multiple_use_keys = set(
Expand Down Expand Up @@ -88,3 +317,29 @@ def transform(key):
return tk, value

return dict(map(transform, map(FoldedCase, self)))

def _parse_idents(self, string):
    """
    Split ``string`` into entries and turn each one into an ``Ident`` with
    a name and/or email. Entries that yield neither are skipped; duplicates
    are removed, keeping first-seen order.
    """
    idents = []
    for entry in _entries_findall(string):
        name, address = _name_email_split(entry)
        if name is None and address is None:
            continue
        idents.append(Ident(name, address))
    return _uniq(idents)

def _parse_names(self, string):
    """
    Split ``string`` into entries and wrap each one as a name-only
    ``Ident`` (email is None), removing duplicates in first-seen order.
    """
    names_only = [Ident(name, None) for name in _entries_findall(string)]
    return _uniq(names_only)

def _parse_names_idents(self, names_field, idents_field):
    """
    Combine the identities found in two metadata fields: ``names_field`` is
    parsed as a list of plain names and ``idents_field`` as a list of
    name/email entries. A missing field is treated as an empty string.
    Name-only identities come first, and duplicates are dropped.
    """
    from_names = self._parse_names(self.get(names_field, ""))
    from_idents = self._parse_idents(self.get(idents_field, ""))
    return _uniq([*from_names, *from_idents])

@property
def authors(self):
    """
    Identities obtained by minimally parsing the "Author" (names only)
    and "Author-email" (names and/or emails) fields.
    """
    names_field, idents_field = "Author", "Author-email"
    return self._parse_names_idents(names_field, idents_field)

@property
def maintainers(self):
    """
    Identities obtained by minimally parsing the "Maintainer" (names only)
    and "Maintainer-email" (names and/or emails) fields.
    """
    names_field, idents_field = "Maintainer", "Maintainer-email"
    return self._parse_names_idents(names_field, idents_field)
14 changes: 14 additions & 0 deletions importlib_metadata/_meta.py
Expand Up @@ -4,6 +4,8 @@
from typing import Protocol
from typing import Any, Dict, Iterator, List, Optional, TypeVar, Union, overload

from ._adapters import Ident


_T = TypeVar("_T")

Expand Down Expand Up @@ -43,6 +45,18 @@ def json(self) -> Dict[str, Union[str, List[str]]]:
A JSON-compatible form of the metadata.
"""

@property
def authors(self) -> List[Ident]:
    """
    Minimal parsing for "Author" and "Author-email" fields.

    Returns a list of ``Ident`` objects, one per identity found.
    """

@property
def maintainers(self) -> List[Ident]:
    """
    Minimal parsing for "Maintainer" and "Maintainer-email" fields.

    Returns a list of ``Ident`` objects, one per identity found.
    """


class SimplePath(Protocol):
"""
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Expand Up @@ -21,6 +21,7 @@ install_requires =

[options.extras_require]
testing =
hypothesis >= 6.85.0
# upstream
pytest >= 6
pytest-checkdocs >= 2.4
Expand Down