/
index.ts
177 lines (155 loc) · 5.77 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import { html, type Token } from 'parse5';
import {
SAXParser,
type EndTag,
type StartTag,
type Doctype,
type Text,
type Comment,
type SaxToken,
} from 'parse5-sax-parser';
import { escapeText, escapeAttribute } from 'entities';
/**
* Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
* A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
*
* The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
* HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
*
* @example
*
* ```js
 * const RewritingStream = require('parse5-html-rewriting');
* const http = require('http');
* const fs = require('fs');
*
* const file = fs.createWriteStream('/home/google.com.html');
* const rewriter = new RewritingStream();
*
 * // Replace spans with divs
* rewriter.on('startTag', startTag => {
* if (startTag.tagName === 'span') {
* startTag.tagName = 'div';
* }
*
* rewriter.emitStartTag(startTag);
* });
*
* rewriter.on('endTag', endTag => {
* if (endTag.tagName === 'span') {
* endTag.tagName = 'div';
* }
*
* rewriter.emitEndTag(endTag);
* });
*
* // Wrap all text nodes with an <i> tag
* rewriter.on('text', (_, raw) => {
* // Use the raw representation of text without HTML entities decoding
* rewriter.emitRaw(`<i>${raw}</i>`);
* });
*
* http.get('http://google.com', res => {
* // Assumes response is UTF-8.
* res.setEncoding('utf8');
* // `RewritingStream` is a `Transform` stream, which means you can pipe
* // through it.
* res.pipe(rewriter).pipe(file);
* });
* ```
*/
export class RewritingStream extends SAXParser {
    /** Note: `sourceCodeLocationInfo` is always enabled. */
    constructor() {
        super({ sourceCodeLocationInfo: true });
    }

    override _transformChunk(chunk: string): string {
        // NOTE: ignore upstream return values as we want to push to
        // the `Writable` part of the `Transform` stream ourselves.
        super._transformChunk(chunk);
        return '';
    }

    /**
     * Returns the raw source text of a token, sliced out of the tokenizer's
     * current buffer. Offsets are absolute over the whole input, so the
     * amount of buffer already dropped must be subtracted first.
     */
    private _getRawHtml(location: Token.Location): string {
        const { droppedBufferSize, html: bufferedHtml } = this.tokenizer.preprocessor;

        return bufferedHtml.slice(
            location.startOffset - droppedBufferSize,
            location.endOffset - droppedBufferSize
        );
    }

    // Events
    protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        const hadListener = super.emitIfListenerExists(eventName, token);

        // Unhandled tokens pass through verbatim as raw source text.
        if (!hadListener) {
            this.emitRaw(this._getRawHtml(token.sourceCodeLocation!));
        }

        // NOTE: don't skip new lines after `<pre>` and other tags,
        // otherwise we'll have incorrect raw data.
        this.parserFeedbackSimulator.skipNextNewLine = false;

        return true;
    }

    // Emitter API
    protected override _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!));
    }

    /** Emits a serialized document type token into the output stream. */
    public emitDoctype(token: Doctype): void {
        const parts = [`<!DOCTYPE ${token.name}`];

        if (token.publicId !== null) {
            parts.push(` PUBLIC "${token.publicId}"`);
        } else if (token.systemId !== null) {
            parts.push(' SYSTEM');
        }

        if (token.systemId !== null) {
            parts.push(` "${token.systemId}"`);
        }

        parts.push('>');
        this.push(parts.join(''));
    }

    /** Emits a serialized start tag token into the output stream. */
    public emitStartTag(token: StartTag): void {
        const serializedAttrs = token.attrs
            .map((attr) => ` ${attr.name}="${escapeAttribute(attr.value)}"`)
            .join('');

        this.push(`<${token.tagName}${serializedAttrs}${token.selfClosing ? '/>' : '>'}`);
    }

    /** Emits a serialized end tag token into the output stream. */
    public emitEndTag(token: EndTag): void {
        this.push(`</${token.tagName}>`);
    }

    /** Emits a serialized text token into the output stream. */
    public emitText({ text }: Text): void {
        // Text inside e.g. <script>/<style> (outside foreign content) must
        // not be entity-escaped; everything else is.
        const keepUnescaped =
            !this.parserFeedbackSimulator.inForeignContent &&
            html.hasUnescapedText(this.tokenizer.lastStartTagName, true);

        this.push(keepUnescaped ? text : escapeText(text));
    }

    /** Emits a serialized comment token into the output stream. */
    public emitComment(token: Comment): void {
        this.push(`<!--${token.text}-->`);
    }

    /** Emits a raw HTML string into the output stream. */
    public emitRaw(html: string): void {
        this.push(html);
    }
}
export interface RewritingStream {
    /** Raised when the rewriter encounters a start tag. */
    on(event: 'startTag', listener: (startTag: StartTag, rawHtml: string) => void): this;

    /** Raised when the rewriter encounters an end tag. */
    on(event: 'endTag', listener: (endTag: EndTag, rawHtml: string) => void): this;

    /** Raised when the rewriter encounters a comment. */
    on(event: 'comment', listener: (comment: Comment, rawHtml: string) => void): this;

    /** Raised when the rewriter encounters text content. */
    on(event: 'text', listener: (text: Text, rawHtml: string) => void): this;

    /** Raised when the rewriter encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */
    on(event: 'doctype', listener: (doctype: Doctype, rawHtml: string) => void): this;

    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}