Move XML Name pattern to epub3

sphinx-doc · Jan 3, 2023 · f4ab9ad · f4ab9ad
1 parent 5eb79c1
commit f4ab9ad
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 35 deletions.
diff --git a/sphinx/builders/epub3.py b/sphinx/builders/epub3.py
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 import html
+import re
 from os import path
 from typing import Any, NamedTuple
 
@@ -14,7 +15,7 @@
 from sphinx.builders import _epub_base
 from sphinx.config import ENUM, Config
 from sphinx.locale import __
-from sphinx.util import logging, xmlname_checker
+from sphinx.util import logging
 from sphinx.util.fileutil import copy_asset_file
 from sphinx.util.i18n import format_date
 from sphinx.util.osutil import make_filename
@@ -50,6 +51,19 @@ class NavPoint(NamedTuple):
     'xmlns:epub="http://www.idpf.org/2007/ops">'
 )
 
+# Regex alternations for the XML ``Name`` production (NameStartChar NameChar*):
+# https://www.w3.org/TR/REC-xml/#NT-Name
+_xml_name_start_char = (
+    ':|[A-Z]|_|[a-z]|[\u00C0-\u00D6]'
+    '|[\u00D8-\u00F6]|[\u00F8-\u02FF]|[\u0370-\u037D]'
+    '|[\u037F-\u1FFF]|[\u200C-\u200D]|[\u2070-\u218F]'
+    '|[\u2C00-\u2FEF]|[\u3001-\uD7FF]|[\uF900-\uFDCF]'
+    '|[\uFDF0-\uFFFD]|[\U00010000-\U000EFFFF]')
+# NameChar extends NameStartChar; the leading '|' keeps '-' a separate alternative.
+_xml_name_char = (
+    _xml_name_start_char + r'|\-|\.' '|[0-9]|\u00B7|[\u0300-\u036F]|[\u203F-\u2040]')
+_XML_NAME_PATTERN = re.compile(f'({_xml_name_start_char})({_xml_name_char})*')
+
 
 class Epub3Builder(_epub_base.EpubBuilder):
     """
@@ -187,7 +201,7 @@ def validate_config_values(app: Sphinx) -> None:
         logger.warning(__('conf value "epub_language" (or "language") '
                           'should not be empty for EPUB3'))
     # <package> unique-identifier attribute
-    if not xmlname_checker().match(app.config.epub_uid):
+    if not _XML_NAME_PATTERN.match(app.config.epub_uid):
         logger.warning(__('conf value "epub_uid" should be XML NAME for EPUB3'))
     # dc:title
     if not app.config.epub_title:

diff --git a/sphinx/util/__init__.py b/sphinx/util/__init__.py
@@ -371,32 +371,11 @@ def isurl(url: str) -> bool:
     return bool(url) and '://' in url
 
 
-def xmlname_checker() -> re.Pattern:
-    # https://www.w3.org/TR/REC-xml/#NT-Name
-    name_start_chars = [
-        ':', ['A', 'Z'], '_', ['a', 'z'], ['\u00C0', '\u00D6'],
-        ['\u00D8', '\u00F6'], ['\u00F8', '\u02FF'], ['\u0370', '\u037D'],
-        ['\u037F', '\u1FFF'], ['\u200C', '\u200D'], ['\u2070', '\u218F'],
-        ['\u2C00', '\u2FEF'], ['\u3001', '\uD7FF'], ['\uF900', '\uFDCF'],
-        ['\uFDF0', '\uFFFD'], ['\U00010000', '\U000EFFFF']]
-
-    name_chars = [
-        "\\-", "\\.", ['0', '9'], '\u00B7', ['\u0300', '\u036F'],
-        ['\u203F', '\u2040']
-    ]
-
-    def convert(entries: Any, splitter: str = '|') -> str:
-        results = []
-        for entry in entries:
-            if isinstance(entry, list):
-                results.append('[%s]' % convert(entry, '-'))
-            else:
-                results.append(entry)
-        return splitter.join(results)
+def _xml_name_checker():
+    # to prevent import cycles
+    from sphinx.builders.epub3 import _XML_NAME_PATTERN
 
-    start_chars_regex = convert(name_start_chars)
-    name_chars_regex = convert(name_chars)
-    return re.compile(f'({start_chars_regex})({start_chars_regex}|{name_chars_regex})*')
+    return _XML_NAME_PATTERN
 
 
 deprecated_alias('sphinx.util',
@@ -410,6 +389,7 @@ def convert(entries: Any, splitter: str = '|') -> str:
                      'rfc1123_to_epoch': _http_date.rfc1123_to_epoch,
                      'save_traceback': _exceptions.save_traceback,
                      'format_exception_cut_frames': _exceptions.format_exception_cut_frames,
+                     'xmlname_checker': _xml_name_checker,
                  },
                  RemovedInSphinx70Warning,
                  {
@@ -422,4 +402,5 @@ def convert(entries: Any, splitter: str = '|') -> str:
                      'rfc1123_to_epoch': 'sphinx.http_date.rfc1123_to_epoch',
                      'save_traceback': 'sphinx.exceptions.save_traceback',
                      'format_exception_cut_frames': 'sphinx.exceptions.format_exception_cut_frames',  # NoQA: E501
+                     'xmlname_checker': 'sphinx.builders.epub3._XML_NAME_PATTERN',
                  })
diff --git a/tests/test_build_epub.py b/tests/test_build_epub.py
@@ -7,6 +7,8 @@
 
 import pytest
 
+from sphinx.builders.epub3 import _XML_NAME_PATTERN
+
 
 # check given command is runnable
 def runnable(command):
@@ -382,3 +384,9 @@ def test_run_epubcheck(app):
             print(exc.stdout.decode('utf-8'))
             print(exc.stderr.decode('utf-8'))
             raise AssertionError('epubcheck exited with return code %s' % exc.returncode)
+
+
+def test_xml_name_pattern_check():
+    """Check that sample XML names match and a digit-initial name does not."""
+    assert all(_XML_NAME_PATTERN.match(name) for name in ('id-pub', 'webpage'))
+    assert not _XML_NAME_PATTERN.match('1bfda21')
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -6,7 +6,7 @@
 import pytest
 
 from sphinx.errors import ExtensionError
-from sphinx.util import encode_uri, ensuredir, import_object, parselinenos, xmlname_checker
+from sphinx.util import encode_uri, ensuredir, import_object, parselinenos
 
 
 def test_encode_uri():
@@ -75,10 +75,3 @@ def test_parselinenos():
         parselinenos('-', 10)
     with pytest.raises(ValueError):
         parselinenos('3-1', 10)
-
-
-def test_xmlname_check():
-    checker = xmlname_checker()
-    assert checker.match('id-pub')
-    assert checker.match('webpage')
-    assert not checker.match('1bfda21')