-
-
Notifications
You must be signed in to change notification settings - Fork 231
/
index.ts
144 lines (126 loc) · 4.71 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import type { Location } from 'parse5/dist/common/token.js';
import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser';
import { hasUnescapedText, escapeString } from 'parse5/dist/serializer/index.js';
/**
* Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
* A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
*
* The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
* HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
*
* @example
*
* ```js
* const { RewritingStream } = require('parse5-html-rewriting-stream');
* const http = require('http');
* const fs = require('fs');
*
* const file = fs.createWriteStream('/home/google.com.html');
* const rewriter = new RewritingStream();
*
* // Replace spans with divs
* rewriter.on('startTag', startTag => {
* if (startTag.tagName === 'span') {
* startTag.tagName = 'div';
* }
*
* rewriter.emitStartTag(startTag);
* });
*
* rewriter.on('endTag', endTag => {
* if (endTag.tagName === 'span') {
* endTag.tagName = 'div';
* }
*
* rewriter.emitEndTag(endTag);
* });
*
* // Wrap all text nodes with <i> tag
* rewriter.on('text', (_, raw) => {
* // Use raw representation of text without HTML entities decoding
* rewriter.emitRaw(`<i>${raw}</i>`);
* });
*
* http.get('http://google.com', res => {
* // Assumes response is UTF-8.
* res.setEncoding('utf8');
* // RewritingStream is the Transform stream, which means you can pipe
* // through it.
* res.pipe(rewriter).pipe(file);
* });
* ```
*/
export class RewritingStream extends SAXParser {
    /** Note: The `sourceCodeLocationInfo` is always enabled. */
    constructor() {
        super({ sourceCodeLocationInfo: true });
    }

    /**
     * Runs the chunk through the base parser and always reports an empty string.
     *
     * The base implementation's return value is deliberately discarded: output is
     * written by this class itself through `push()` (see the `emit*` methods).
     */
    override _transformChunk(chunk: string): string {
        super._transformChunk(chunk);

        return '';
    }

    /**
     * Returns the original source text covered by `location`.
     *
     * Both offsets are shifted by the preprocessor's `droppedBufferSize` before
     * slicing its `html` buffer (presumably because location offsets count from the
     * start of the input while the buffer no longer holds the dropped prefix).
     */
    private _getRawHtml(location: Location): string {
        const preprocessor = this.tokenizer.preprocessor;
        const shift = preprocessor.droppedBufferSize;
        const source = preprocessor.html;

        return source.slice(location.startOffset - shift, location.endOffset - shift);
    }

    // Events

    /**
     * Delegates to the base implementation; when it returns `false` (presumably
     * meaning nobody listens for `eventName`), the token's raw source is written
     * to the output unchanged. Always returns `true`.
     */
    protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        const handledByListener = super.emitIfListenerExists(eventName, token);

        if (!handledByListener) {
            const rawHtml = this._getRawHtml(token.sourceCodeLocation!);

            this.emitRaw(rawHtml);
        }

        // NOTE: new lines following <pre> and similar tags must not be skipped,
        // otherwise the raw data we pass through would be incorrect.
        this.parserFeedbackSimulator.skipNextNewLine = false;

        return true;
    }

    // Emitter API

    /** Emits `eventName` with the token and, as a second argument, its raw source HTML. */
    protected override _emitToken(eventName: string, token: SaxToken): void {
        const rawHtml = this._getRawHtml(token.sourceCodeLocation!);

        this.emit(eventName, token, rawHtml);
    }

    /**
     * Emits serialized document type token into the output stream.
     *
     * A non-null `publicId` yields a `PUBLIC "..."` part; otherwise a non-null
     * `systemId` yields the `SYSTEM` keyword. A non-null `systemId` is always
     * appended as a quoted string.
     */
    public emitDoctype(token: Doctype): void {
        const { name, publicId, systemId } = token;
        const hasSystemId = systemId !== null;
        const pieces: string[] = [`<!DOCTYPE ${name}`];

        if (publicId !== null) {
            pieces.push(` PUBLIC "${publicId}"`);
        } else if (hasSystemId) {
            pieces.push(' SYSTEM');
        }

        if (hasSystemId) {
            pieces.push(` "${systemId}"`);
        }

        pieces.push('>');
        this.push(pieces.join(''));
    }

    /**
     * Emits serialized start tag token into the output stream.
     *
     * Every attribute is written as ` name="value"`, with the value passed through
     * `escapeString(value, true)`. The tag is closed with `/>` when
     * `token.selfClosing` is set and with `>` otherwise.
     */
    public emitStartTag(token: StartTag): void {
        let html = `<${token.tagName}`;

        for (const attr of token.attrs) {
            html += ` ${attr.name}="${escapeString(attr.value, true)}"`;
        }

        html += token.selfClosing ? '/>' : '>';
        this.push(html);
    }

    /** Emits serialized end tag token into the output stream. */
    public emitEndTag(token: EndTag): void {
        const { tagName } = token;

        this.push(`</${tagName}>`);
    }

    /**
     * Emits serialized text token into the output stream.
     *
     * The text is written verbatim when the parser is not in foreign content and
     * `hasUnescapedText` accepts the last start tag name; in every other case it
     * goes through `escapeString(text, false)` first.
     */
    public emitText({ text }: Text): void {
        const writeVerbatim =
            !this.parserFeedbackSimulator.inForeignContent &&
            hasUnescapedText(this.tokenizer.lastStartTagName, true);

        if (writeVerbatim) {
            this.push(text);
        } else {
            this.push(escapeString(text, false));
        }
    }

    /** Emits serialized comment token into the output stream. */
    public emitComment(token: Comment): void {
        const { text } = token;

        this.push(`<!--${text}-->`);
    }

    /** Emits raw HTML string into the output stream. */
    public emitRaw(html: string): void {
        this.push(html);
    }
}