Skip to content

Commit

Permalink
add option blockTextElements #78
Browse files Browse the repository at this point in the history
  • Loading branch information
taoqf committed Oct 27, 2020
1 parent d7a8c29 commit 3e32856
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 24 deletions.
9 changes: 6 additions & 3 deletions README.md
Expand Up @@ -74,10 +74,13 @@ Parse given data, and return root of the generated DOM.
```js
{
lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily)
script: true, // retrieve content in <script> (hurt performance slightly)
style: true, // retrieve content in <style> (hurt performance slightly)
pre: true, // retrieve content in <pre> (hurt performance slightly)
comment: false, // retrieve comments (hurts performance slightly)
blockTextElements: {
script: true, // keep text content when parsing
noscript: true, // keep text content when parsing
style: true, // keep text content when parsing
pre: true // keep text content when parsing
}
}
```

Expand Down
53 changes: 34 additions & 19 deletions src/nodes/html.ts
Expand Up @@ -648,23 +648,13 @@ const kElementsClosedByClosing = {
th: { tr: true, table: true, TR: true, TABLE: true },
TH: { tr: true, table: true, TR: true, TABLE: true }
};
const kBlockTextElements = {
script: true,
SCRIPT: true,
noscript: true,
NOSCRIPT: true,
style: true,
STYLE: true,
pre: true,
PRE: true
};

export interface Options {
lowerCaseTagName: boolean;
script: boolean;
style: boolean;
pre: boolean;
comment: boolean;
blockTextElements: {
[tag: string]: boolean;
};
}

const frameflag = 'documentfragmentcontainer';
Expand All @@ -675,10 +665,35 @@ const frameflag = 'documentfragmentcontainer';
* @param {string} data html
* @return {HTMLElement} root element
*/
export function parse(data: string, options?: Options): HTMLElement & { valid: boolean };
export function parse(data: string, options?: Options & { noFix: false }): HTMLElement & { valid: boolean };
export function parse(data: string, options?: Options & { noFix: true }): (HTMLElement | TextNode) & { valid: boolean };
export function parse(data: string, options = { pre: true, style: true, script: true, lowerCaseTagName: false, comment: false } as Options & { noFix?: boolean }) {
export function parse(data: string, options?: Partial<Options>): HTMLElement & { valid: boolean };
export function parse(data: string, options?: Partial<Options> & { noFix: false }): HTMLElement & { valid: boolean };
export function parse(data: string, options?: Partial<Options> & { noFix: true }): (HTMLElement | TextNode) & { valid: boolean };
export function parse(data: string, options = { lowerCaseTagName: false, comment: false } as Partial<Options & { noFix: boolean }>) {
// Tags whose body is handled as one raw text block instead of being parsed as
// markup. Keys are tag names; the value says whether that raw text is kept
// (true) or dropped (false). Falls back to the built-in set when the caller
// does not pass `blockTextElements`.
const elements = options.blockTextElements || {
	script: true,
	noscript: true,
	style: true,
	pre: true
};
const element_names = Object.keys(elements);
/**
 * Build a case-insensitive matcher for one configured tag name.
 *
 * The pattern is anchored with ^...$ so it has to match the whole tag name.
 * Without the anchors `RegExp.test` does a substring search, so `pre` would
 * also match `<presentation>` and `script` would also match `<noscript>`,
 * which made a `noscript: false` setting ineffective whenever `script: true`
 * was present.
 *
 * @param name configured tag name (key of `blockTextElements`)
 * @returns anchored, case-insensitive pattern for that tag name
 */
function tag_pattern(name: string) {
	return new RegExp(`^(?:${name})$`, 'i');
}
// Matchers for every configured tag, regardless of its true/false value.
const kBlockTextElements = element_names.map((it) => {
	return tag_pattern(it);
});
// Matchers for only those configured tags whose value is truthy.
const kIgnoreElements = element_names.filter((it) => {
	return elements[it];
}).map((it) => {
	return tag_pattern(it);
});
/**
 * Tell whether `tag` is a configured block text element whose value is truthy.
 * Despite the name, a `true` result is what lets the parser keep the raw text
 * of the element.
 *
 * @param tag tag name as it appears in the markup (any letter case)
 * @returns true when the tag is configured with a truthy value
 */
function element_should_be_ignore(tag: string) {
	return kIgnoreElements.some((it) => {
		return it.test(tag);
	});
}
/**
 * Tell whether `tag` is listed in the block text element configuration at all,
 * whatever its value; the parser uses this to decide whether to search for the
 * matching closing tag.
 *
 * @param tag tag name as it appears in the markup (any letter case)
 * @returns true when the tag is one of the configured block text elements
 */
function is_block_text_element(tag: string) {
	return kBlockTextElements.some((it) => {
		return it.test(tag);
	});
}
const root = new HTMLElement(null, {});
let currentParent = root;
const stack = [root];
Expand Down Expand Up @@ -728,7 +743,7 @@ export function parse(data: string, options = { pre: true, style: true, script:
// https://github.com/taoqf/node-html-parser/issues/38
currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3]));
stack.push(currentParent);
if (kBlockTextElements[match[2]]) {
if (is_block_text_element(match[2])) {
// a little test to find next </script> or </style> ...
const closeMarkup = `</${match[2]}>`;
const index = (() => {
Expand All @@ -737,7 +752,7 @@ export function parse(data: string, options = { pre: true, style: true, script:
}
return data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
})();
if (options[match[2]]) {
if (element_should_be_ignore(match[2])) {
let text: string;
if (index === -1) {
// there is no matching ending for the text element.
Expand Down
2 changes: 1 addition & 1 deletion test/html.js
Expand Up @@ -539,7 +539,7 @@ describe('HTML Parser', function () {
it('set content pre', function () {
const root = parseHTML(`<html><head></head><body></body></html>`);
const body = root.querySelector("body");
body.set_content(`<pre>this is some preformatted text</pre>`, { pre: true });
body.set_content(`<pre>this is some preformatted text</pre>`);
root.toString().should.eql('<html><head></head><body><pre>this is some preformatted text</pre></body></html>')
});
});
Expand Down
41 changes: 40 additions & 1 deletion test/pre.js
Expand Up @@ -9,7 +9,46 @@ describe('pre tag', function () {
</div>
</div>
`;
const root = parse(html);
const root = parse(html, {
blockTextElements: {
script: true,
noscript: true,
style: true,
pre: true
}
});
root.toString().should.eql(html);
});
it('should ignore pre tag', function () {
const html = `
<div class="language-python highlighter-rouge">
<div class="highlight"> <pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="s">'hello'</span><span class="p">)</span><br><span class="n">i</span> <span class="o">=</span> <span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><br></code></pre>
</div>
</div>
`;
const root = parse(html, {
blockTextElements: {
pre: false
}
});
root.toString().should.eql(`
<div class="language-python highlighter-rouge">
<div class="highlight"> <pre class="highlight"></pre>
</div>
</div>
`);
});
it('do not treat pre as text block element', function () {
const html = `<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="s">'hello'</span><span class="p">)</span><br><span class="n">i</span><span class="o">=</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><br></code></pre>
</div>
</div>
`;
const root = parse(html, {
blockTextElements: {}
});
const div = root.firstChild.firstChild;
const pre = div.firstChild;
const code = pre.firstChild;
code.childNodes.length.should.eql(11);
});
});

0 comments on commit 3e32856

Please sign in to comment.