Skip to content

Commit

Permalink
StreamParser: add fragment parse methods
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Jan 10, 2024
1 parent 2b443df commit 1f1f72d
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 27 deletions.
27 changes: 13 additions & 14 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
private boolean baseUriSetFromDoc;
private @Nullable Element headElement; // the current head element
private @Nullable FormElement formElement; // the current form element
private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing
private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
private ArrayList<Element> formattingElements; // active (open) formatting elements
private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
Expand Down Expand Up @@ -94,20 +94,19 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
fragmentParsing = false;
}

@Override List<Node> doParseFragment(@Nullable Element context) {
@Override void initialiseParseFragment(@Nullable Element context) {
// context may be null
state = HtmlTreeBuilderState.Initial;
contextElement = context;
fragmentParsing = true;
Element root = null;

if (context != null) {
final String contextName = context.normalName();
contextElement = new Element(tagFor(contextName, settings), baseUri);
if (context.ownerDocument() != null) // quirks setup:
doc.quirksMode(context.ownerDocument().quirksMode());

// initialise the tokeniser state:
String contextTag = context.normalName();
switch (contextTag) {
switch (contextName) {
case "title":
case "textarea":
tokeniser.transition(TokeniserState.Rcdata);
Expand All @@ -132,9 +131,8 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
default:
tokeniser.transition(TokeniserState.Data);
}
root = new Element(tagFor(contextTag, settings), baseUri);
doc.appendChild(root);
push(root);
doc.appendChild(contextElement);
push(contextElement);
resetInsertionMode();

// setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
Expand All @@ -148,15 +146,16 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
formSearch = formSearch.parent();
}
}
}

runParser();
if (context != null) {
@Override List<Node> completeParseFragment() {
if (contextElement != null) {
// depending on context and the input html, content may have been added outside of the root el
// e.g. context=p, input=div, the div will have been pushed out.
List<Node> nodes = root.siblingNodes();
List<Node> nodes = contextElement.siblingNodes();
if (!nodes.isEmpty())
root.insertChildren(-1, nodes);
return root.childNodes();
contextElement.insertChildren(-1, nodes);
return contextElement.childNodes();
}
else
return doc.childNodes();
Expand Down
43 changes: 41 additions & 2 deletions src/main/java/org/jsoup/parser/StreamParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import java.io.UncheckedIOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Spliterator;
Expand Down Expand Up @@ -67,7 +68,7 @@ public StreamParser(Parser parser) {
}

/**
Provide the input for a parse. The input is not read until a consuming operation is called.
Provide the input for a Document parse. The input is not read until a consuming operation is called.
@param input the input to be read.
@param baseUri the URL of this input, for absolute link resolution
@return this parser, for chaining
Expand All @@ -81,7 +82,7 @@ public StreamParser parse(Reader input, String baseUri) {
}

/**
Provide the input for a parse. The input is not read until a consuming operation is called.
Provide the input for a Document parse. The input is not read until a consuming operation is called.
@param input the input to be read
@param baseUri the URL of this input, for absolute link resolution
@return this parser
Expand All @@ -90,6 +91,32 @@ public StreamParser parse(String input, String baseUri) {
return parse(new StringReader(input), baseUri);
}

/**
Provide the input for a fragment parse. The input is not read until a consuming operation is called.
<p>The parser is first set up through {@link #parse(Reader, String)}, and the tree builder is then handed the
(optional) context element. Call {@link #completeFragment()} to run the parse to the end of the input and retrieve
the parsed fragment nodes.</p>
@param input the input to be read
@param context the optional fragment context element; may be {@code null}
@param baseUri the URL of this input, for absolute link resolution
@return this parser, for chaining
@see #completeFragment()
*/
public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) {
parse(input, baseUri); // same setup as for a Document parse
treeBuilder.initialiseParseFragment(context); // hand the (possibly null) context to the tree builder
return this;
}

/**
Provide the input for a fragment parse. The input is not read until a consuming operation is called.
<p>Convenience overload: wraps the String in a {@link StringReader} and delegates to
{@link #parseFragment(Reader, Element, String)}.</p>
@param input the input to be read
@param context the optional fragment context element; may be {@code null}
@param baseUri the URL of this input, for absolute link resolution
@return this parser
@see #completeFragment()
*/
public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) {
return parseFragment(new StringReader(input), context, baseUri);
}

/**
Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each
Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
Expand Down Expand Up @@ -162,6 +189,18 @@ public Document complete() throws IOException {
return doc;
}

/**
When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed
fragment child nodes.
<p>The returned list is whatever the tree builder's fragment completion step produces once the parser has run.</p>
@return the completed child nodes
@throws IOException if an I/O error occurs
@see #parseFragment(Reader, Element, String)
*/
public List<Node> completeFragment() throws IOException {
// NOTE(review): this appears safe to call after the input was already consumed via stream() / iterator(),
// as the tree builder's stepParser() tolerates an already-closed builder -- confirm against TreeBuilder.
treeBuilder.runParser(); // consume any remaining input
return treeBuilder.completeParseFragment(); // let the tree builder collect the fragment's nodes
}

/**
Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
input will be parsed until the first match is found, or the input is completely read.
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,16 @@ Document parse(Reader input, String baseUri, Parser parser) {

List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) {
initialiseParse(new StringReader(inputFragment), baseUri, parser);
return doParseFragment(context);
initialiseParseFragment(context);
runParser();
return completeParseFragment();
}

/**
 Hook called when setting up a fragment parse, before the parser is run. This base implementation does nothing;
 a tree builder that needs a fragment context can override it.
 @param context the fragment context element; may be null
 */
void initialiseParseFragment(@Nullable Element context) {
// in Html, sets up context; no-op in XML
}

abstract List<Node> doParseFragment(@Nullable Element context);
abstract List<Node> completeParseFragment();

/** Set the node listener, which will then get callbacks for node insert and removals. */
void nodeListener(NodeVisitor nodeListener) {
Expand All @@ -102,7 +108,7 @@ void runParser() {
boolean stepParser() {
// if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks:
if (currentToken.type == Token.TokenType.EOF) {
if (stack.isEmpty()) return false;
if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment()
pop();
return true;
}
Expand Down
10 changes: 4 additions & 6 deletions src/main/java/org/jsoup/parser/XmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ public class XmlTreeBuilder extends TreeBuilder {
@Override
protected void initialiseParse(Reader input, String baseUri, Parser parser) {
super.initialiseParse(input, baseUri, parser);
stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack). Note not push()ed, so not onNodeInserted.
doc.outputSettings()
.syntax(Document.OutputSettings.Syntax.xml)
.escapeMode(Entities.EscapeMode.xhtml)
Expand All @@ -49,6 +48,10 @@ Document parse(String input, String baseUri) {
return parse(new StringReader(input), baseUri, new Parser(this));
}

/**
 Returns the result of a fragment parse for XML: the child nodes of the document being built.
 @return the document's child nodes
 */
@Override List<Node> completeParseFragment() {
return doc.childNodes();
}

@Override
XmlTreeBuilder newInstance() {
return new XmlTreeBuilder();
Expand Down Expand Up @@ -164,9 +167,4 @@ protected void popStackToClose(Token.EndTag endTag) {
}
}
private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain

@Override List<Node> doParseFragment(@Nullable Element context) {
runParser();
return doc.childNodes();
}
}
106 changes: 104 additions & 2 deletions src/test/java/org/jsoup/parser/StreamParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
import org.jsoup.integration.ParseTest;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.jspecify.annotations.NullMarked;
import org.junit.jupiter.api.Test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import static org.junit.jupiter.api.Assertions.*;
Expand All @@ -37,6 +37,18 @@ void canStream() {
}
}

@Test
void canStreamXml() {
    // XML parse: tag case is kept as written (DIV vs div), and the outermost element is the last one emitted
    final String xml = "<outmost><DIV id=1>D1</DIV><div id=2>D2<p id=3><span>P One</p><p id=4>P Two</p></div><div id=5>D3<p id=6>P three</p>";
    final StringBuilder trace = new StringBuilder();

    try (StreamParser streamer = new StreamParser(Parser.xmlParser()).parse(xml, "")) {
        streamer.stream().forEachOrdered(element -> trackSeen(element, trace));

        // verifies emission order; a trailing + marks an element that had a next sibling when it was emitted
        assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", trace.toString());
    }
}

@Test void canIterate() {
// same as stream, just a different interface
String html = "<title>Test</title></head><div id=1>D1</div><div id=2>D2<p id=3><span>P One</p><p id=4>P Two</p></div><div id=5>D3<p id=6>P three</p>";
Expand Down Expand Up @@ -327,4 +339,94 @@ private static CharacterReader getReader(StreamParser streamer) {
// the reader should be closed as streamer is closed on completion of read
assertTrue(isClosed(streamer));
}

// Fragments

@Test
void canStreamFragment() {
    final String fragment = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
    final Element tableContext = new Element("table");
    final StringBuilder trace = new StringBuilder();

    try (StreamParser streamer = new StreamParser(Parser.htmlParser()).parseFragment(fragment, tableContext, "")) {
        streamer.stream().forEachOrdered(element -> trackSeen(element, trace));

        // verifies emission order; a trailing + marks an element that had a next sibling when it was emitted.
        // only the fragment is produced (no full document shell), with the context element emitted last
        assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", trace.toString());

        // reading to completion closes the parser
        assertTrue(isClosed(streamer));
    }
}

@Test void canIterateFragment() {
    // mirrors canStreamFragment, but drives the parse through the Iterator interface.
    // the first <tr> has no closing tag; the following <tr> implies it
    final String fragment = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
    final Element tableContext = new Element("table");
    final StringBuilder trace = new StringBuilder();

    try (StreamParser streamer = new StreamParser(Parser.htmlParser()).parseFragment(fragment, tableContext, "")) {
        for (Iterator<Element> elements = streamer.iterator(); elements.hasNext(); ) {
            trackSeen(elements.next(), trace);
        }

        // verifies emission order; a trailing + marks an element that had a next sibling when it was emitted.
        // only the fragment is produced (no full document shell), with the context element emitted last
        assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", trace.toString());

        // reading to completion closes the parser
        assertTrue(isClosed(streamer));
    }
}

@Test
void canSelectAndCompleteFragment() throws IOException {
    final String fragment = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
    final Element tableContext = new Element("table");

    try (StreamParser streamer = new StreamParser(Parser.htmlParser()).parseFragment(fragment, tableContext, "")) {
        // walk the cells one at a time, in document order
        final Element firstCell = streamer.expectNext("td");
        assertEquals("One", firstCell.ownText());

        final Element secondCell = streamer.expectNext("td");
        assertEquals("Two", secondCell.ownText());

        final Element thirdCell = streamer.expectNext("td");
        assertEquals("Three", thirdCell.ownText());

        // no fourth cell: selectNext reports the miss with null
        final Element noCell = streamer.selectNext("td");
        assertNull(noCell);

        // completing the fragment hands back the top-level nodes
        final List<Node> completed = streamer.completeFragment();
        assertEquals(1, completed.size()); // should be the inferred tbody
        final Node tbody = completed.get(0);
        assertEquals("tbody", tbody.nodeName());

        final List<Node> rows = tbody.childNodes();
        assertEquals(3, rows.size()); // should be the three TRs
        assertSame(rows.get(0).childNode(0), firstCell); // tr -> td

        // the streamed elements belong to the shell document backing this fragment
        assertSame(streamer.document(), firstCell.ownerDocument());
    }
}

@Test
void canStreamFragmentXml() throws IOException {
    final String xml = "<tr id=1><td>One</td></tr><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
    final Element otherContext = new Element("Other");
    final StringBuilder trace = new StringBuilder();

    try (StreamParser streamer = new StreamParser(Parser.xmlParser()).parseFragment(xml, otherContext, "")) {
        streamer.stream().forEachOrdered(element -> trackSeen(element, trace));

        // verifies emission order; a trailing + marks an element that had a next sibling when it was emitted.
        // only the fragment's own elements are produced (no document shell)
        assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", trace.toString());

        // reading to completion closes the parser
        assertTrue(isClosed(streamer));

        // the fragment can still be completed after the stream has been fully consumed
        final List<Node> completed = streamer.completeFragment();
        assertEquals(3, completed.size());
        assertEquals("tr", completed.get(0).nodeName());
    }
}

}

0 comments on commit 1f1f72d

Please sign in to comment.