-
Notifications
You must be signed in to change notification settings - Fork 79
/
index.ts
200 lines (182 loc) · 5.54 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import { EventEmitter } from 'events';
import * as gaxios from 'gaxios';
import * as http from 'http';
import enableDestroy = require('server-destroy');
import { getLinks } from './links';
const finalhandler = require('finalhandler');
const serveStatic = require('serve-static');
/**
 * Options used to configure a crawl job.
 */
export interface CheckOptions {
// Port for the local static web server; only used when `path` is a local
// directory. Defaults to a random port in the 5000-6000 range.
port?: number;
// A local directory to serve and scan, or an http(s) URL to scan directly.
path: string;
// When true, also crawl pages whose url starts with the root `path`.
recurse?: boolean;
// Regular expression patterns; urls matching any of them are reported as
// SKIPPED instead of being requested.
linksToSkip?: string[];
}
/**
 * The terminal state of a single scanned link.
 */
export enum LinkState {
OK = 'OK',
BROKEN = 'BROKEN',
SKIPPED = 'SKIPPED',
}
/**
 * Result of scanning a single url.
 */
export interface LinkResult {
// The url that was checked.
url: string;
// HTTP status code; undefined when the link was skipped or stays 0 on
// request failure (invalid domain, etc.).
status?: number;
// OK, BROKEN, or SKIPPED.
state: LinkState;
}
/**
 * Aggregate result of a whole crawl job.
 */
export interface CrawlResult {
// True when no link finished in the BROKEN state.
passed: boolean;
// Every link visited during the crawl, in visit order.
links: LinkResult[];
}
/**
 * Internal per-request state threaded through recursive crawl calls.
 */
interface CrawlOptions {
// The url to fetch on this step.
url: string;
// When true, perform a GET and scan the body for further links;
// otherwise just a HEAD request.
crawl: boolean;
// Shared accumulator of results across the whole crawl.
results: LinkResult[];
// Urls already visited, to avoid re-scanning and infinite loops.
cache: Set<string>;
// The user-supplied options for the overall job.
checkOptions: CheckOptions;
}
/**
 * Instance class used to perform a crawl job.
 */
export class LinkChecker extends EventEmitter {
  /**
   * Crawl a given url or path, and return a list of visited links along with
   * status codes.
   * @param options Options to use while checking for 404s
   * @returns The aggregate crawl result: every link visited and a pass flag.
   */
  async check(options: CheckOptions) {
    // Work on copies so the caller's options object is never mutated:
    // repeated calls must not accumulate '^mailto:' entries, and the caller
    // must not see `path` rewritten to the local server url.
    const linksToSkip = [...(options.linksToSkip || []), '^mailto:'];
    let path = options.path;
    let server: http.Server | undefined;
    if (!path.startsWith('http')) {
      // Serve the local directory over a throwaway static server.
      const port = options.port || 5000 + Math.round(Math.random() * 1000);
      server = await this.startWebServer(path, port);
      enableDestroy(server);
      path = `http://localhost:${port}`;
    }
    // Effective options: same as the caller's, with the resolved path and
    // the augmented skip list.
    const checkOptions: CheckOptions = {...options, path, linksToSkip};
    try {
      const results = await this.crawl({
        url: path,
        crawl: true,
        checkOptions,
        results: [],
        cache: new Set(),
      });
      return {
        links: results,
        passed: results.filter(x => x.state === LinkState.BROKEN).length === 0,
      };
    } finally {
      // Always tear down the static server, even if the crawl throws.
      if (server) {
        server.destroy();
      }
    }
  }
  /**
   * Spin up a local HTTP server to serve static requests from disk
   * @param root The local path that should be mounted as a static web server
   * @param port The port on which to start the local web server
   * @private
   * @returns Promise that resolves with the instance of the HTTP server
   */
  private startWebServer(root: string, port: number): Promise<http.Server> {
    return new Promise((resolve, reject) => {
      const serve = serveStatic(root);
      const server = http
        .createServer((req, res) => serve(req, res, finalhandler(req, res)))
        .listen(port, () => resolve(server))
        .on('error', reject);
    });
  }
  /**
   * Crawl a given url with the provided options.
   * @param opts List of options used to do the crawl
   * @private
   * @returns A list of crawl results consisting of urls and status codes
   */
  private async crawl(opts: CrawlOptions): Promise<LinkResult[]> {
    // Check to see if we've already scanned this url
    if (opts.cache.has(opts.url)) {
      return opts.results;
    }
    opts.cache.add(opts.url);
    // Check for links that should be skipped
    const shouldSkip = (opts.checkOptions.linksToSkip || []).some(
      linkToSkip => new RegExp(linkToSkip).test(opts.url)
    );
    if (shouldSkip) {
      const result: LinkResult = {url: opts.url, state: LinkState.SKIPPED};
      opts.results.push(result);
      this.emit('link', result);
      return opts.results;
    }
    // Perform a HEAD or GET request based on the need to crawl
    let status = 0;
    let state = LinkState.BROKEN;
    let data = '';
    let shouldRecurse = false;
    try {
      let res = await gaxios.request<string>({
        method: opts.crawl ? 'GET' : 'HEAD',
        url: opts.url,
        responseType: opts.crawl ? 'text' : 'stream',
        validateStatus: () => true,
      });
      // If we got an HTTP 405, the server may not like HEAD. GET instead!
      if (res.status === 405) {
        res = await gaxios.request<string>({
          method: 'GET',
          url: opts.url,
          responseType: 'stream',
          validateStatus: () => true,
        });
      }
      // Assume any 2xx status is OK
      status = res.status;
      if (res.status >= 200 && res.status < 300) {
        state = LinkState.OK;
      }
      data = res.data;
      shouldRecurse = isHtml(res);
    } catch (err) {
      // Deliberate best-effort: a request failure (invalid domain name,
      // connection refused, etc.) leaves the link in the BROKEN state with
      // status 0 rather than aborting the whole crawl.
    }
    const result: LinkResult = {url: opts.url, status, state};
    opts.results.push(result);
    this.emit('link', result);
    // If we need to go deeper, scan the next level of depth for links and crawl
    if (opts.crawl && shouldRecurse) {
      this.emit('pagestart', opts.url);
      const urls = getLinks(data, opts.url);
      for (const url of urls) {
        // only crawl links that start with the same host
        const crawl =
          !!opts.checkOptions.recurse && url.startsWith(opts.checkOptions.path);
        await this.crawl({
          url,
          crawl,
          cache: opts.cache,
          results: opts.results,
          checkOptions: opts.checkOptions,
        });
      }
    }
    // Return the aggregate results
    return opts.results;
  }
}
/**
 * Convenience method to perform a scan.
 * @param options CheckOptions to be passed on
 * @returns The crawl results produced by a one-off LinkChecker instance.
 */
export async function check(options: CheckOptions) {
  const checker = new LinkChecker();
  return await checker.check(options);
}
/**
 * Checks to see if a given response contains HTML.
 * @param response Page response.
 * @returns True when the content-type header mentions text/html.
 */
function isHtml(response: gaxios.GaxiosResponse): boolean {
  const contentType = response.headers['content-type'] || '';
  return contentType.match(/text\/html/) !== null;
}