Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parser agnostic i18n Locale transform #12238

Open
wants to merge 29 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0ab14a0
WIP: New i18n logic based on inline_parse function
n-peugnet Apr 7, 2024
2682f8b
Avoid 'Literal block expected; none found.' warnings
n-peugnet Apr 7, 2024
38c004b
Update gettext builder output in tests
n-peugnet Apr 7, 2024
9a2510a
parse_inline's input string is always a string
n-peugnet Apr 7, 2024
3c97944
Only output a single paragraph in RSTParser's parse_inline
n-peugnet Apr 7, 2024
66b14d3
Image nodes are handled separately, and literal can simply be manuall…
n-peugnet Apr 7, 2024
5994ca5
Fix ruff lints + more useful comment
n-peugnet Apr 7, 2024
0794348
Fix last ruff error
n-peugnet Apr 8, 2024
358c611
Remove unused type:ignore annotation
n-peugnet Apr 8, 2024
94b9b9d
Simplify parse_inline literal block handling for rST
n-peugnet Apr 14, 2024
17455d6
Fix rendering for parsed-literals
n-peugnet Apr 15, 2024
c10b0ec
Merge remote-tracking branch 'origin/master' into new-inline-parse-i1…
n-peugnet Apr 27, 2024
0fc3f28
Regression test for parsed literals translation
n-peugnet Apr 14, 2024
fe7675f
Regression test for strange markup
n-peugnet Apr 21, 2024
7f94dcf
Directly use Inliner.parse() instead of 'Text' initial_state
n-peugnet May 8, 2024
14e4833
Add warning messages to doccument in parse_inline
n-peugnet May 8, 2024
c81d310
Fix ruff lints
n-peugnet May 9, 2024
910dbd7
Apply ruff format patch
n-peugnet May 9, 2024
3b92bfb
Refactor to fix MyPy error
n-peugnet May 10, 2024
8c40fb6
Add tests for parse_inline
n-peugnet May 11, 2024
ca54a6c
Fix ruff lints
n-peugnet May 11, 2024
7de9a40
Skip parse_inline() tests with docutils < 0.19
n-peugnet May 11, 2024
7500a00
Try to ignore incorrect mypy lint
n-peugnet May 11, 2024
e7e7fad
Remove unneeded inliner, title and section options of memo
n-peugnet May 12, 2024
dba713d
Add test for substitution reference for parse_inline
n-peugnet May 12, 2024
2c37780
Simplify parse_inline for RST
n-peugnet May 19, 2024
3d5441d
Add test for titles in parse_inline
n-peugnet May 19, 2024
463ac3d
Merge remote-tracking branch 'origin/master' into new-inline-parse-i1…
n-peugnet May 19, 2024
d84d4e6
Fix <unknown> source file in translated warnings
n-peugnet May 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions sphinx/io.py
Expand Up @@ -143,6 +143,12 @@ def setup(self, app: Sphinx) -> None:
if transform in self.transforms:
self.transforms.remove(transform)

def parse(self) -> None:
"""Override the BaseReader parse method to call self.parser.parse_inline()."""
self.document = document = self.new_document()
self.parser.parse_inline(self.input, document)
document.current_source = document.current_line = None


class SphinxDummyWriter(UnfilteredWriter):
"""Dummy writer module used for generating doctree."""
Expand Down
28 changes: 28 additions & 0 deletions sphinx/parsers.py
Expand Up @@ -46,6 +46,10 @@
self.config = app.config
self.env = app.env

def parse_inline(self, inputstring: str, document: nodes.document) -> None:
"""Parse the inline elements of a text block and generate a document tree."""
raise NotImplementedError('Parser subclasses must implement parse_inline')

Check failure on line 51 in sphinx/parsers.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (EM101)

sphinx/parsers.py:51:35: EM101 Exception must not use a string literal, assign to variable first


class RSTParser(docutils.parsers.rst.Parser, Parser):
"""A reST parser for Sphinx."""
Expand All @@ -60,6 +64,30 @@
transforms.remove(SmartQuotes)
return transforms

def parse_inline(self, inputstring: str, document: nodes.document) -> None:
"""Parse inline syntax from text and generate a document tree."""
# Avoid "Literal block expected; none found." warnings.
has_literal = inputstring.endswith('::')
if has_literal:
inputstring = inputstring[:-2]

self.setup_parse(inputstring, document) # type: ignore[arg-type]
self.statemachine = states.RSTStateMachine(
state_classes=self.state_classes,
initial_state='Text',
debug=document.reporter.debug_flag,
)

inputlines = StringList([inputstring], document.current_source)

self.decorate(inputlines)
self.statemachine.run(inputlines, document, inliner=self.inliner)
self.finish_parse()
if has_literal:
p = document[0]
assert isinstance(p, nodes.paragraph)
p += nodes.Text(':')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well this was my initial idea, but it was a lot more complicated to implement. I don't remember if I managed to make it work in the end, but if I did, then it had the same result as using Text as the initial_state (the :: without literal block were still causing issues), as I did on line 77

initial_state='Text',

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok yes I indeed managed to make it work, but it looked like this:

diff --git a/sphinx/parsers.py b/sphinx/parsers.py
index 09ee7e8ff..a99c6d498 100644
--- a/sphinx/parsers.py
+++ b/sphinx/parsers.py
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
 import docutils.parsers
 import docutils.parsers.rst
 from docutils import nodes
-from docutils.parsers.rst import states
+from docutils.parsers.rst import states, languages
 from docutils.statemachine import StringList
 from docutils.transforms.universal import SmartQuotes
 
@@ -71,18 +71,26 @@ class RSTParser(docutils.parsers.rst.Parser, Parser):
         if has_literal:
             inputstring = inputstring[:-2]
 
-        self.setup_parse(inputstring, document)  # type: ignore[arg-type]
-        self.statemachine = states.RSTStateMachine(
-            state_classes=self.state_classes,
-            initial_state='Text',
-            debug=document.reporter.debug_flag,
-        )
-
-        inputlines = StringList([inputstring], document.current_source)
-
-        self.decorate(inputlines)
-        self.statemachine.run(inputlines, document, inliner=self.inliner)
-        self.finish_parse()
+        language = languages.get_language(
+            document.settings.language_code, document.reporter)
+        if self.inliner is None:
+            inliner = states.Inliner()
+        else:
+            inliner = self.inliner
+        inliner.init_customizations(document.settings)
+        memo = states.Struct(document=document,
+                             reporter=document.reporter,
+                             language=language,
+                             title_styles=[],
+                             section_level=0,
+                             section_bubble_up_kludge=False,
+                             inliner=inliner)
+        memo.reporter.get_source_and_line = lambda x: (document.source, x)
+        textnodes, _ = inliner.parse(inputstring, 1, memo, document)
+        p = nodes.paragraph(inputstring, '', *textnodes)
+        p.source = document.source
+        p.line = 1
+        document.append(p)
         if has_literal:
             p = document[0]
             assert isinstance(p, nodes.paragraph)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then it had the same result as using Text

Text also passes definition lists and section titles; its definitely much cleaner to use the proper inline parsing, even if docutils does not make this as easy 😒

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

textnodes, _ = inliner.parse(inputstring, 1, memo, document)

here you should also have parse_inline take the actual line number and use that, plus the _ is system_message nodes that should be appended to the paragraph

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I'll try to add it, but not sure how to test it.

Well.. if this proposal is to be accepted, then really it needs to have proper "generic" tests, not just specific to the i18n use case, as obviously it could be used for other use cases

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I get it. Sorry I was too focused on the problem I was trying to solve, but you are right.

Regardless of the implementation details, what do you think about this proposal? As this is more of a proof of concept than a finished product.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeh I mean, coming from the MyST perspective, in principle I am certainly in favor of removing "rST hard-coded" aspects of the code base 😄 (the other big problematic aspect of sphinx for this is executablebooks/MyST-Parser#228)

But indeed, it is quite a "core" addition to sphinx, more broad reaching than just this use case,
so I would obviously want to be very careful (and have good agreement from other maintainers) before merging anything

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course, I marked it as "draft" to make it clearer that it is not ready yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chrisjsewell

This should all be using the self.inliner to only parse inline syntaxes

☑️

here you should also have parse_inline take the actual line number and use that

☑️

plus the _ is system_message nodes that should be appended to the paragraph

☑️

Also I started to add some tests for this new function.


def parse(self, inputstring: str | StringList, document: nodes.document) -> None:
"""Parse text and generate a document tree."""
self.setup_parse(inputstring, document) # type: ignore[arg-type]
Expand Down
70 changes: 12 additions & 58 deletions sphinx/transforms/i18n.py
Expand Up @@ -5,7 +5,6 @@
import contextlib
from os import path
from re import DOTALL, match
from textwrap import indent
from typing import TYPE_CHECKING, Any, TypeVar

from docutils import nodes
Expand All @@ -21,7 +20,6 @@
from sphinx.util.i18n import docname_to_domain
from sphinx.util.index_entries import split_index_msg
from sphinx.util.nodes import (
IMAGE_TYPE_NODES,
LITERAL_TYPE_NODES,
NodeMatcher,
extract_messages,
Expand Down Expand Up @@ -380,25 +378,12 @@ def apply(self, **kwargs: Any) -> None:
node['translated'] = True
continue

# Avoid "Literal block expected; none found." warnings.
# If msgstr ends with '::' then it cause warning message at
# parser.parse() processing.
# literal-block-warning is only appear in avobe case.
if msgstr.strip().endswith('::'):
msgstr += '\n\n dummy literal'
# dummy literal node will discard by 'patch = patch[0]'

# literalblock need literal block notation to avoid it become
# paragraph.
# literalblock can not contain references or terms
if isinstance(node, LITERAL_TYPE_NODES):
msgstr = '::\n\n' + indent(msgstr, ' ' * 3)
continue

patch = publish_msgstr(self.app, msgstr, source,
node.line, self.config, settings) # type: ignore[arg-type]
# FIXME: no warnings about inconsistent references in this part
# XXX doctest and other block markup
if not isinstance(patch, nodes.paragraph):
continue # skip for now

updater = _NodeUpdater(node, patch, self.document, noqa=False)
processed = updater.update_title_mapping()
Expand Down Expand Up @@ -453,45 +438,23 @@ def apply(self, **kwargs: Any) -> None:
node['alt'] = msgstr
continue

# Avoid "Literal block expected; none found." warnings.
# If msgstr ends with '::' then it cause warning message at
# parser.parse() processing.
# literal-block-warning is only appear in avobe case.
if msgstr.strip().endswith('::'):
msgstr += '\n\n dummy literal'
# dummy literal node will discard by 'patch = patch[0]'
if isinstance(node, nodes.image) and node.get('uri') == msg:
node['uri'] = msgstr
continue

# literalblock need literal block notation to avoid it become
# paragraph.
# literalblock do not need to be parsed as they do not contain inline syntax
if isinstance(node, LITERAL_TYPE_NODES):
msgstr = '::\n\n' + indent(msgstr, ' ' * 3)

# Structural Subelements phase1
# There is a possibility that only the title node is created.
# see: https://docutils.sourceforge.io/docs/ref/doctree.html#structural-subelements
if isinstance(node, nodes.title):
# This generates: <section ...><title>msgstr</title></section>
msgstr = msgstr + '\n' + '=' * len(msgstr) * 2
node.children = [nodes.Text(msgstr)]
# for highlighting that expects .rawsource and .astext() are same.
node.rawsource = node.astext()
node['translated'] = True
continue

patch = publish_msgstr(self.app, msgstr, source,
node.line, self.config, settings) # type: ignore[arg-type]
# Structural Subelements phase2
if isinstance(node, nodes.title):
# get <title> node that placed as a first child
patch = patch.next_node() # type: ignore[assignment]

# ignore unexpected markups in translation message
unexpected: tuple[type[nodes.Element], ...] = (
nodes.paragraph, # expected form of translation
nodes.title, # generated by above "Subelements phase2"
)

# following types are expected if
# config.gettext_additional_targets is configured
unexpected += LITERAL_TYPE_NODES
unexpected += IMAGE_TYPE_NODES

if not isinstance(patch, unexpected):
if not isinstance(patch, nodes.paragraph):
continue # skip

updater = _NodeUpdater(node, patch, self.document, noqa)
Expand All @@ -502,15 +465,6 @@ def apply(self, **kwargs: Any) -> None:
updater.update_pending_xrefs()
updater.update_leaves()

# for highlighting that expects .rawsource and .astext() are same.
if isinstance(node, LITERAL_TYPE_NODES):
node.rawsource = node.astext()

if isinstance(node, nodes.image) and node.get('alt') != msg:
node['uri'] = patch['uri']
node['translated'] = False
continue # do not mark translated

node['translated'] = True # to avoid double translation

if 'index' in self.config.gettext_additional_targets:
Expand Down
3 changes: 1 addition & 2 deletions sphinx/util/nodes.py
Expand Up @@ -266,8 +266,7 @@ def extract_messages(doctree: Element) -> Iterable[tuple[Element, str]]:
if node.get('alt'):
yield node, node['alt']
if node.get('translatable'):
image_uri = node.get('original_uri', node['uri'])
msg = f'.. image:: {image_uri}'
msg = node.get('original_uri', node['uri'])
else:
msg = ''
elif isinstance(node, nodes.meta):
Expand Down
8 changes: 4 additions & 4 deletions tests/roots/test-intl/xx/LC_MESSAGES/figure.po
Expand Up @@ -40,14 +40,14 @@ msgstr "IMAGE URL AND ALT"
msgid "img"
msgstr "IMG -> I18N"

msgid ".. image:: img.png"
msgstr ".. image:: i18n.png"
msgid "img.png"
msgstr "i18n.png"

msgid "i18n"
msgstr "I18N -> IMG"

msgid ".. image:: i18n.png"
msgstr ".. image:: img.png"
msgid "i18n.png"
msgstr "img.png"

msgid "image on substitution"
msgstr "IMAGE ON SUBSTITUTION"
Expand Down
Expand Up @@ -28,11 +28,11 @@ msgstr "SUBSTITUTED IMAGE |subst_epilog_2| HERE."
msgid "subst_prolog_2"
msgstr "SUBST_PROLOG_2 TRANSLATED"

msgid ".. image:: /img.png"
msgstr ".. image:: /i18n.png"
msgid "/img.png"
msgstr "/i18n.png"

msgid "subst_epilog_2"
msgstr "SUBST_EPILOG_2 TRANSLATED"

msgid ".. image:: /i18n.png"
msgstr ".. image:: /img.png"
msgid "/i18n.png"
msgstr "/img.png"
4 changes: 2 additions & 2 deletions tests/test_builders/test_build_gettext.py
Expand Up @@ -207,11 +207,11 @@ def test_gettext_prolog_epilog_substitution(app):
"This is content that contains |subst_prolog_1|.",
"Substituted image |subst_prolog_2| here.",
"subst_prolog_2",
".. image:: /img.png",
"/img.png",
"This is content that contains |subst_epilog_1|.",
"Substituted image |subst_epilog_2| here.",
"subst_epilog_2",
".. image:: /i18n.png",
"/i18n.png",
]


Expand Down