fix(perf): use request streams to reduce memory usage (#336)
JustinBeckwith committed Oct 5, 2021
1 parent ce88410 commit 6e33b39
Showing 2 changed files with 23 additions and 17 deletions.
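In short: the checker previously buffered each response body as a string before scanning it for links; with this commit the body is parsed as it streams in, so a large page no longer has to fit in memory all at once. A minimal sketch of the request-side idea (the demo function and URL are illustrative, not part of the diff):

    import {request} from 'gaxios';
    import {Readable} from 'stream';

    async function demo(url: string) {
      // Before: responseType 'text' buffered the whole body into res.data.
      //   const res = await request<string>({url, responseType: 'text'});
      // After: responseType 'stream' yields a Readable that can be piped
      // into a parser and consumed chunk by chunk.
      const res = await request<Readable>({url, responseType: 'stream'});
      res.data.pipe(process.stdout); // or into an HTML parser, as below
    }

    // demo('https://example.com'); // placeholder URL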
src/index.ts (12 additions, 11 deletions)
@@ -2,6 +2,7 @@ import {EventEmitter} from 'events';
 import {URL} from 'url';
 import * as http from 'http';
 import * as path from 'path';
+import {Readable} from 'stream';
 
 import {request, GaxiosResponse} from 'gaxios';
 
@@ -222,16 +223,15 @@ export class LinkChecker extends EventEmitter {
     // Perform a HEAD or GET request based on the need to crawl
     let status = 0;
     let state = LinkState.BROKEN;
-    let data = '';
     let shouldRecurse = false;
-    let res: GaxiosResponse<string> | undefined = undefined;
+    let res: GaxiosResponse<Readable> | undefined = undefined;
     const failures: {}[] = [];
     try {
-      res = await request<string>({
+      res = await request<Readable>({
         method: opts.crawl ? 'GET' : 'HEAD',
         url: opts.url.href,
         headers,
-        responseType: opts.crawl ? 'text' : 'stream',
+        responseType: 'stream',
         validateStatus: () => true,
         timeout: opts.checkOptions.timeout,
       });
@@ -241,7 +241,7 @@
 
       // If we got an HTTP 405, the server may not like HEAD. GET instead!
       if (res.status === 405) {
-        res = await request<string>({
+        res = await request<Readable>({
           method: 'GET',
           url: opts.url.href,
           headers,
@@ -257,7 +257,7 @@
       // request failure: invalid domain name, etc.
       // this also occasionally catches too many redirects, but is still valid (e.g. https://www.ebay.com)
       // for this reason, we also try doing a GET below to see if the link is valid
-      failures.push(err);
+      failures.push(err as Error);
     }
 
     try {
@@ -266,10 +266,10 @@
         (res === undefined || res.status < 200 || res.status >= 300) &&
         !opts.crawl
       ) {
-        res = await request<string>({
+        res = await request<Readable>({
           method: 'GET',
           url: opts.url.href,
-          responseType: 'text',
+          responseType: 'stream',
           validateStatus: () => true,
           headers,
           timeout: opts.checkOptions.timeout,
@@ -279,13 +279,12 @@
         }
       }
     } catch (ex) {
-      failures.push(ex);
+      failures.push(ex as Error);
      // catch the next failure
    }
 
    if (res !== undefined) {
      status = res.status;
-      data = res.data;
      shouldRecurse = isHtml(res);
    }
 
@@ -309,7 +308,9 @@
    // If we need to go deeper, scan the next level of depth for links and crawl
    if (opts.crawl && shouldRecurse) {
      this.emit('pagestart', opts.url);
-      const urlResults = getLinks(data, opts.url.href);
+      const urlResults = res?.data
+        ? await getLinks(res.data, opts.url.href)
+        : [];
      for (const result of urlResults) {
        // if there was some sort of problem parsing the link while
        // creating a new URL obj, treat it as a broken link.
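
Putting the two halves together, the call site now has roughly this shape; a sketch assuming the async getLinks from src/links.ts below (crawlPage is a made-up name):

    import {request, GaxiosResponse} from 'gaxios';
    import {Readable} from 'stream';
    import {getLinks} from './links';

    async function crawlPage(href: string): Promise<void> {
      let res: GaxiosResponse<Readable> | undefined = undefined;
      try {
        res = await request<Readable>({
          method: 'GET',
          url: href,
          responseType: 'stream',
          validateStatus: () => true, // non-2xx is data here, not a throw
        });
      } catch {
        // request-level failure: leave res undefined
      }
      // The body may be missing when the request failed, hence the guard.
      const urlResults = res?.data ? await getLinks(res.data, href) : [];
      for (const result of urlResults) {
        console.log(result.link);
      }
    }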
src/links.ts (11 additions, 6 deletions)
@@ -1,4 +1,5 @@
-import * as htmlParser from 'htmlparser2';
+import * as htmlParser from 'htmlparser2/lib/WritableStream';
+import {Readable} from 'stream';
 import {URL} from 'url';
 
 const linksAttr = {
@@ -42,11 +43,14 @@ export interface ParsedUrl {
   url?: URL;
 }
 
-export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
+export async function getLinks(
+  source: Readable,
+  baseUrl: string
+): Promise<ParsedUrl[]> {
   let realBaseUrl = baseUrl;
   let baseSet = false;
   const links = new Array<ParsedUrl>();
-  const parser = new htmlParser.Parser({
+  const parser = new htmlParser.WritableStream({
     onopentag(tag: string, attributes: {[s: string]: string}) {
       // Allow alternate base URL to be specified in tag:
       if (tag === 'base' && !baseSet) {
@@ -79,8 +83,9 @@
      }
    },
  });
-  parser.write(source);
-  parser.end();
+  await new Promise((resolve, reject) => {
+    source.pipe(parser).on('finish', resolve).on('error', reject);
+  });
  return links;
 }
 
@@ -110,6 +115,6 @@ function parseLink(link: string, baseUrl: string): ParsedUrl {
    url.hash = '';
    return {link, url};
  } catch (error) {
-    return {link, error};
+    return {link, error: error as Error};
  }
 }

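For reference, htmlparser2's WritableStream is a Node writable stream that drives the same callbacks as the in-memory Parser; below is a small self-contained sketch of the pattern getLinks now uses (collectHrefs and the sample HTML are illustrative):

    import {WritableStream} from 'htmlparser2/lib/WritableStream';
    import {Readable} from 'stream';

    async function collectHrefs(source: Readable): Promise<string[]> {
      const hrefs: string[] = [];
      const parser = new WritableStream({
        onopentag(tag, attributes) {
          if (tag === 'a' && attributes.href) {
            hrefs.push(attributes.href);
          }
        },
      });
      // Resolve once the whole stream has been parsed, as getLinks does above.
      await new Promise((resolve, reject) => {
        source.pipe(parser).on('finish', resolve).on('error', reject);
      });
      return hrefs;
    }

    // collectHrefs(Readable.from(['<a href="/about">About</a>']))
    //   .then(console.log); // ['/about']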