From d4d06eda40e2a39003032c882730a1d869d4af10 Mon Sep 17 00:00:00 2001 From: Ron S Date: Sat, 22 May 2021 18:24:18 -0400 Subject: [PATCH 1/4] fix issue taoqf#115 --- src/nodes/html.ts | 38 +++++++++++++++++++++++++++++++++++++- test/html.js | 8 +++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/nodes/html.ts b/src/nodes/html.ts index 4d1f21b..2e51f75 100644 --- a/src/nodes/html.ts +++ b/src/nodes/html.ts @@ -133,6 +133,42 @@ export default class HTMLElement extends Node { return JSON.stringify(attr.replace(/"/g, '"')); } + + /** + * Trim all whitespace except single leading/trailing non-breaking space + * @param text string to trim + * @returns {string} trimmed value + * @private + */ + private trimTextNodeWhitespace(text: string): string { + let i = 0; + let startPos; + let endPos; + + while (i >= 0 && i < text.length) { + if (/\S/.test(text[i])) { + if (startPos === undefined) { + startPos = i; + i = text.length; + } else { + endPos = i; + i = void 0; + } + } + + if (startPos === undefined) i++; + else i--; + } + + if (startPos === undefined) startPos = 0; + if (endPos === undefined) endPos = text.length - 1; + + const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]); + const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]); + + return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : ''); + } + /** * Creates an instance of HTMLElement. * @param keyAttrs id and class attribute @@ -401,7 +437,7 @@ export default class HTMLElement extends Node { if ((node as TextNode).isWhitespace) { return; } - node.rawText = node.rawText.trim(); + node.rawText = this.trimTextNodeWhitespace(node.rawText); } else if (node.nodeType === NodeType.ELEMENT_NODE) { (node as HTMLElement).removeWhitespace(); } diff --git a/test/html.js b/test/html.js index 6c9b8de..f8c6c01 100644 --- a/test/html.js +++ b/test/html.js @@ -198,7 +198,7 @@ describe('HTML Parser', function () { describe('#removeWhitespace()', function () { it('should remove whitespaces while preserving nodes with content', function () { - const root = parseHTML('

\r \n \t

123

'); + const root = parseHTML('

\r \n \t

123

'); const p = new HTMLElement('p', {}, '', root); p.appendChild(new HTMLElement('h5', {}, '')) @@ -206,6 +206,12 @@ describe('HTML Parser', function () { root.firstChild.removeWhitespace().should.eql(p); }); + + it('should preserve legitimate leading/trailing whitespace in TextNode', function () { + parseHTML('

Hello World!

').removeWhitespace().firstChild.text.should.eql('Hello World!'); + parseHTML('

\t\nHello\n\tWorld!

').removeWhitespace().firstChild.text.should.eql('HelloWorld!'); + parseHTML('

\t\n Hello \n\tWorld!

').removeWhitespace().firstChild.text.should.eql(' Hello World!'); + }); }); describe('#rawAttributes', function () { From b75a51d0279f1a1f0ae382e78e712192b40ef024 Mon Sep 17 00:00:00 2001 From: Ron S Date: Sat, 22 May 2021 18:58:49 -0400 Subject: [PATCH 2/4] Refactored to implement with structuredText --- src/nodes/html.ts | 39 ++------------------------------------- src/nodes/text.ts | 39 +++++++++++++++++++++++++++++++++++++++ test/html.js | 2 +- 3 files changed, 42 insertions(+), 38 deletions(-) diff --git a/src/nodes/html.ts b/src/nodes/html.ts index 2e51f75..10a9a8c 100644 --- a/src/nodes/html.ts +++ b/src/nodes/html.ts @@ -134,41 +134,6 @@ export default class HTMLElement extends Node { return JSON.stringify(attr.replace(/"/g, '"')); } - /** - * Trim all whitespace except single leading/trailing non-breaking space - * @param text string to trim - * @returns {string} trimmed value - * @private - */ - private trimTextNodeWhitespace(text: string): string { - let i = 0; - let startPos; - let endPos; - - while (i >= 0 && i < text.length) { - if (/\S/.test(text[i])) { - if (startPos === undefined) { - startPos = i; - i = text.length; - } else { - endPos = i; - i = void 0; - } - } - - if (startPos === undefined) i++; - else i--; - } - - if (startPos === undefined) startPos = 0; - if (endPos === undefined) endPos = text.length - 1; - - const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]); - const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]); - - return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : ''); - } - /** * Creates an instance of HTMLElement. * @param keyAttrs id and class attribute @@ -296,7 +261,7 @@ export default class HTMLElement extends Node { // Whitespace node, postponed output currentBlock.prependWhitespace = true; } else { - let text = node.text; + let text = (node).trimmedText; if (currentBlock.prependWhitespace) { text = ` ${text}`; currentBlock.prependWhitespace = false; @@ -437,7 +402,7 @@ export default class HTMLElement extends Node { if ((node as TextNode).isWhitespace) { return; } - node.rawText = this.trimTextNodeWhitespace(node.rawText); + node.rawText = (node).trimmedText; } else if (node.nodeType === NodeType.ELEMENT_NODE) { (node as HTMLElement).removeWhitespace(); } diff --git a/src/nodes/text.ts b/src/nodes/text.ts index 068666a..eabfa50 100644 --- a/src/nodes/text.ts +++ b/src/nodes/text.ts @@ -17,6 +17,45 @@ export default class TextNode extends Node { */ public nodeType = NodeType.TEXT_NODE; + private _trimmedText?: string; + + /** + * Returns text with all whitespace trimmed except single leading/trailing non-breaking space + */ + public get trimmedText() { + if (this._trimmedText !== undefined) return this._trimmedText; + + const text = this.rawText; + let i = 0; + let startPos; + let endPos; + + while (i >= 0 && i < text.length) { + if (/\S/.test(text[i])) { + if (startPos === undefined) { + startPos = i; + i = text.length; + } else { + endPos = i; + i = void 0; + } + } + + if (startPos === undefined) i++; + else i--; + } + + if (startPos === undefined) startPos = 0; + if (endPos === undefined) endPos = text.length - 1; + + const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]); + const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]); + + this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : ''); + + return this._trimmedText; + } + /** * Get unescaped text value of current node and its children. * @return {string} text content diff --git a/test/html.js b/test/html.js index f8c6c01..19eaff3 100644 --- a/test/html.js +++ b/test/html.js @@ -202,7 +202,7 @@ describe('HTML Parser', function () { const p = new HTMLElement('p', {}, '', root); p.appendChild(new HTMLElement('h5', {}, '')) - .appendChild(new TextNode('123')); + .appendChild(Object.assign(new TextNode('123'), { _trimmedText: '123' })); root.firstChild.removeWhitespace().should.eql(p); }); From 0195dd3abb08469cf320835947d080cccb9e239e Mon Sep 17 00:00:00 2001 From: Ron S Date: Sat, 22 May 2021 19:09:19 -0400 Subject: [PATCH 3/4] style: Normalize indents to tab --- src/nodes/text.ts | 56 +++++++++++++++++++++++------------------------ test/html.js | 8 +++---- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/nodes/text.ts b/src/nodes/text.ts index eabfa50..9b473d7 100644 --- a/src/nodes/text.ts +++ b/src/nodes/text.ts @@ -19,42 +19,42 @@ export default class TextNode extends Node { private _trimmedText?: string; - /** - * Returns text with all whitespace trimmed except single leading/trailing non-breaking space - */ + /** + * Returns text with all whitespace trimmed except single leading/trailing non-breaking space + */ public get trimmedText() { - if (this._trimmedText !== undefined) return this._trimmedText; + if (this._trimmedText !== undefined) return this._trimmedText; - const text = this.rawText; - let i = 0; - let startPos; - let endPos; + const text = this.rawText; + let i = 0; + let startPos; + let endPos; - while (i >= 0 && i < text.length) { - if (/\S/.test(text[i])) { - if (startPos === undefined) { - startPos = i; - i = text.length; - } else { - endPos = i; - i = void 0; - } - } + while (i >= 0 && i < text.length) { + if (/\S/.test(text[i])) { + if (startPos === undefined) { + startPos = i; + i = text.length; + } else { + endPos = i; + i = void 0; + } + } - if (startPos === undefined) i++; - else i--; - } + if (startPos === undefined) i++; + else i--; + } - if (startPos === undefined) startPos = 0; - if (endPos === undefined) endPos = text.length - 1; + if (startPos === undefined) startPos = 0; + if (endPos === undefined) endPos = text.length - 1; - const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]); - const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]); + const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]); + const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]); - this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : ''); + this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : ''); - return this._trimmedText; - } + return this._trimmedText; + } /** * Get unescaped text value of current node and its children. diff --git a/test/html.js b/test/html.js index 19eaff3..0b898e8 100644 --- a/test/html.js +++ b/test/html.js @@ -208,10 +208,10 @@ describe('HTML Parser', function () { }); it('should preserve legitimate leading/trailing whitespace in TextNode', function () { - parseHTML('

Hello World!

').removeWhitespace().firstChild.text.should.eql('Hello World!'); - parseHTML('

\t\nHello\n\tWorld!

').removeWhitespace().firstChild.text.should.eql('HelloWorld!'); - parseHTML('

\t\n Hello \n\tWorld!

').removeWhitespace().firstChild.text.should.eql(' Hello World!'); - }); + parseHTML('

Hello World!

').removeWhitespace().firstChild.text.should.eql('Hello World!'); + parseHTML('

\t\nHello\n\tWorld!

').removeWhitespace().firstChild.text.should.eql('HelloWorld!'); + parseHTML('

\t\n Hello \n\tWorld!

').removeWhitespace().firstChild.text.should.eql(' Hello World!'); + }); }); describe('#rawAttributes', function () { From d914efad83817511becf7331f60fd79d1a807446 Mon Sep 17 00:00:00 2001 From: Ron S Date: Sat, 22 May 2021 19:34:51 -0400 Subject: [PATCH 4/4] test: Improve testing --- test/html.js | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test/html.js b/test/html.js index 0b898e8..e84c20b 100644 --- a/test/html.js +++ b/test/html.js @@ -126,10 +126,10 @@ describe('HTML Parser', function () { const script = root.firstChild; const style = root.lastChild; script.childNodes.should.not.be.empty; - script.childNodes.should.eql([new TextNode('1', script)]); + script.childNodes.should.eql([ new TextNode('1', script) ]); script.text.should.eql('1'); style.childNodes.should.not.be.empty; - style.childNodes.should.eql([new TextNode('2&', style)]); + style.childNodes.should.eql([ new TextNode('2&', style) ]); style.text.should.eql('2&'); style.rawText.should.eql('2&'); }); @@ -198,11 +198,16 @@ describe('HTML Parser', function () { describe('#removeWhitespace()', function () { it('should remove whitespaces while preserving nodes with content', function () { - const root = parseHTML('

\r \n \t

123

'); + const root = parseHTML('

\r \n \t

123

'); + + const textNode = new TextNode(' 123 '); + textNode.rawText = textNode.trimmedText; + textNode.rawText.should.eql(' 123 '); const p = new HTMLElement('p', {}, '', root); - p.appendChild(new HTMLElement('h5', {}, '')) - .appendChild(Object.assign(new TextNode('123'), { _trimmedText: '123' })); + p + .appendChild(new HTMLElement('h5', {}, '')) + .appendChild(textNode); root.firstChild.removeWhitespace().should.eql(p); });