-
-
Notifications
You must be signed in to change notification settings - Fork 23
/
urlFinder.js
66 lines (61 loc) · 1.67 KB
/
urlFinder.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*eslint-env node */
'use strict';
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
module.exports = async function ({ _distDir, visit }) {
let urls = [
'/',
// '/timetable',
//'/podcasts',
'/about',
'/subscribe',
'/dj-inquiry',
'/coc',
//'/djs',
'/chat',
'/sign-up',
'/forum',
'/wiki',
'/shows',
'/support'
];
const isValidUrl = (aTag) => {
return !aTag.hostname &&
!urls.includes(aTag.href.toLowerCase()) &&
/^(\/)+\S+$/.test(aTag.href) &&
!aTag.href.toLowerCase().includes('?');
};
// need to recursively crawl all the links on every page somehow
for (const url of urls) {
let page = await visit(url);
if (page.statusCode === 200) {
let html = await page.html();
let dom = new JSDOM(html);
for (let aTag of [...dom.window.document.querySelectorAll('a')]) {
if (aTag.href) {
if (isValidUrl(aTag)) {
urls.push(aTag.href.toLowerCase());
}
}
}
if (['/podcasts', '/forum', '/wiki', '/shows'].includes(url)) {
for (let aTag of [...dom.window.document.querySelectorAll('span.pagination a')]) {
page = await visit(aTag.href);
if (page.statusCode === 200) {
let html = await page.html();
let dom = new JSDOM(html);
for (let aTag of [...dom.window.document.querySelectorAll('a')]) {
if (aTag.href) {
if (isValidUrl(aTag)) {
urls.push(aTag.href.toLowerCase());
}
}
}
}
}
}
}
}
console.log('url count: ', urls.length);
return urls;
};