-
Notifications
You must be signed in to change notification settings - Fork 79
/
links.ts
142 lines (129 loc) · 3.5 KB
/
links.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import {type Readable} from 'node:stream';
import {WritableStream} from 'htmlparser2/lib/WritableStream';
import {parseSrcset} from 'srcset';
/**
 * Attributes that carry URLs, keyed by attribute name. Each value lists the
 * tag names on which that attribute is inspected for links.
 */
const linksAttribute: Record<string, string[]> = {
  background: ['body'],
  cite: ['blockquote', 'del', 'ins', 'q'],
  data: ['object'],
  href: ['a', 'area', 'embed', 'link'],
  icon: ['command'],
  longdesc: ['frame', 'iframe'],
  manifest: ['html'],
  content: ['meta'],
  poster: ['video'],
  pluginspage: ['embed'],
  pluginurl: ['embed'],
  src: [
    'audio',
    'embed',
    'frame',
    'iframe',
    'img',
    'input',
    'script',
    'source',
    'track',
    'video',
  ],
  srcset: ['img', 'source'],
};

// Create lookup table for tag name to attribute that contains URL.
// The table has a null prototype: tag names come from parsed HTML, and on a
// plain `{}` a tag such as `constructor` would resolve to an inherited
// Object.prototype member instead of `undefined`.
const tagAttribute = Object.create(null) as Record<string, string[]>;
for (const [attribute, tags] of Object.entries(linksAttribute)) {
  for (const tag of tags) {
    (tagAttribute[tag] ||= []).push(attribute);
  }
}
/**
 * Outcome of resolving a single link string against a base URL.
 * A successful resolution carries `url`; a failed one carries `error`.
 */
export type ParsedUrl = {
/** The link string exactly as it was handed to the resolver. */
link: string;
/** The error thrown while constructing a `URL` from `link`, when that failed. */
error?: Error;
/** The resolved URL with its fragment cleared, when construction succeeded. */
url?: URL;
};
/**
 * Parse an HTML stream and collect the links found in URL-bearing attributes.
 *
 * @param source - Readable stream of HTML, piped into an htmlparser2
 *   `WritableStream`.
 * @param baseUrl - URL that links are resolved against, until the document
 *   supplies its own `<base href>`.
 * @returns One `ParsedUrl` per link, in the order the parser encounters them.
 *   Links that cannot be turned into a `URL` are returned with `error` set
 *   rather than being thrown.
 * @throws Rejects when either the source stream or the parser emits `error`.
 */
export async function getLinks(
  source: Readable,
  baseUrl: string,
): Promise<ParsedUrl[]> {
  let realBaseUrl = baseUrl;
  let baseSet = false;
  const links: ParsedUrl[] = [];
  // Ignore href properties for link tags where rel is likely to fail
  // eslint-disable-next-line unicorn/prevent-abbreviations
  const relValuesToIgnore = ['dns-prefetch', 'preconnect'];

  const parser = new WritableStream({
    onopentag(tag: string, attributes: Record<string, string>) {
      // Allow alternate base URL to be specified in tag. Only the first
      // <base> that actually has an href counts: a <base target="..."> has
      // no href, and resolving `undefined` would yield a bogus ".../undefined"
      // base while blocking any later, valid <base href>.
      if (tag === 'base' && !baseSet && typeof attributes.href === 'string') {
        realBaseUrl = getBaseUrl(attributes.href, baseUrl);
        baseSet = true;
      }

      if (tag === 'link' && relValuesToIgnore.includes(attributes.rel)) {
        return;
      }

      // Only for <meta content=""> tags, only validate the url if
      // the content actually looks like a url
      if (tag === 'meta' && attributes.content) {
        try {
          // eslint-disable-next-line no-new
          new URL(attributes.content);
        } catch {
          return;
        }
      }

      // Tag names come from the document, so the lookup may hit something
      // that is not one of our attribute lists (e.g. an inherited object
      // member). Only iterate a real array.
      const urlAttributes = tagAttribute[tag];
      if (!Array.isArray(urlAttributes)) {
        return;
      }

      for (const attribute of urlAttributes) {
        const linkString = attributes[attribute];
        if (linkString) {
          for (const link of parseAttribute(attribute, linkString)) {
            links.push(parseLink(link, realBaseUrl));
          }
        }
      }
    },
  });

  await new Promise<void>((resolve, reject) => {
    // `pipe` does not forward errors from the readable side to the
    // destination, so the source needs its own listener or a failed read
    // would never settle this promise.
    source.once('error', reject);
    source.pipe(parser).on('finish', resolve).on('error', reject);
  });
  return links;
}
/**
 * Work out the base URL declared by a `<base href>` value.
 *
 * @param htmlBaseUrl - The href value taken from the document.
 * @param oldBaseUrl - The base URL in effect before the document overrode it.
 * @returns `htmlBaseUrl` unchanged when it is already absolute; otherwise it
 *   is resolved against `oldBaseUrl` and returned with the fragment cleared.
 */
function getBaseUrl(htmlBaseUrl: string, oldBaseUrl: string): string {
  if (!isAbsoluteUrl(htmlBaseUrl)) {
    const resolved = new URL(htmlBaseUrl, oldBaseUrl);
    resolved.hash = '';
    return resolved.href;
  }

  return htmlBaseUrl;
}
/**
 * Report whether a string starts with a URL scheme.
 *
 * @param url - The string to inspect.
 * @returns `true` when `url` begins with `scheme:`, except for Windows drive
 *   paths such as `C:\dir`, which are reported as not absolute.
 */
function isAbsoluteUrl(url: string): boolean {
  // A drive letter followed by a backslash also looks like `scheme:`, so it
  // has to be ruled out explicitly.
  const looksLikeWindowsPath = /^[a-zA-Z]:\\/.test(url);
  // Scheme: https://tools.ietf.org/html/rfc3986#section-3.1
  // Absolute URL: https://tools.ietf.org/html/rfc3986#section-4.3
  const startsWithScheme = /^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(url);
  return startsWithScheme && !looksLikeWindowsPath;
}
/**
 * Split an attribute value into the individual link strings it contains.
 *
 * @param name - Attribute name; only `srcset` gets special handling.
 * @param value - Raw attribute value.
 * @returns The candidate URLs of a `srcset` value, or `value` itself as a
 *   single-element array for every other attribute.
 */
function parseAttribute(name: string, value: string): string[] {
  switch (name) {
    case 'srcset': {
      // The swapping of any multiple spaces into a single space is here to
      // work around this bug:
      // https://github.com/sindresorhus/srcset/issues/14
      // The `g` flag matters: without it only the first whitespace run is
      // collapsed and later candidates still reach the parser unnormalised.
      const strippedValue = value.replace(/\s+/g, ' ');
      return parseSrcset(strippedValue).map((p) => p.url);
    }

    default: {
      return [value];
    }
  }
}
/**
 * Resolve a link string against a base URL.
 *
 * @param link - The link text as found in the document.
 * @param baseUrl - The URL that relative links are resolved against.
 * @returns `{link, url}` with the fragment cleared on success, or
 *   `{link, error}` when the `URL` constructor throws.
 */
function parseLink(link: string, baseUrl: string): ParsedUrl {
  let resolved: URL;
  try {
    resolved = new URL(link, baseUrl);
  } catch (error) {
    return {link, error: error as Error};
  }

  // The fragment is dropped so the result identifies the document only.
  resolved.hash = '';
  return {link, url: resolved};
}