-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.js
109 lines (105 loc) · 3.28 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
const async = require("async");
const request = require("request");
const fs = require('fs');
// Write stream for 'output.txt', opened as soon as this module is loaded.
// crawl() writes every URL that answered with status 200 to it, one per line.
// Nothing in this file ends or closes the stream.
var wstream = fs.createWriteStream('output.txt');
/**
 * Crawls same-origin links level by level, one "iteration" per depth level.
 *
 * Every URL in `seed` is fetched with redirects disabled, keeping at most
 * `cocurentConnectionLimit` requests in flight. A URL that answers 200 is
 * written to `wstream` and the links found in its body are queued for the
 * next iteration; any other response that carries a `Location` header queues
 * that location instead. The function then calls itself with the queued URLs
 * until nothing new was found or `depthLimit` iterations have run.
 *
 * @param {Object} [options] - Crawl state. Each omitted field falls back to
 *   its own default, so callers may pass only the fields they care about.
 * @param {Set<string>} [options.seed] - URLs to fetch in this iteration
 *   (default: a set containing "https://medium.com/").
 * @param {Set<string>} [options.visited] - URLs fetched so far; mutated in place.
 * @param {Set<string>} [options.seedForNext] - Accumulator for the URLs of the
 *   next iteration; mutated in place.
 * @param {number} [options.iteration=1] - 1-based number of this iteration.
 * @param {number} [options.depthLimit=5] - Last iteration that may run.
 * @param {number} [options.cocurentConnectionLimit=5] - Max parallel requests.
 * @param {Function} [cb] - Node-style callback, invoked with `(error)` or
 *   `(null, visited)` when the crawl ends.
 */
const crawl = function (options = {}, cb = () => { }) {
  // Per-field defaults: a partially filled options object must not leave the
  // remaining fields undefined. `|| {}` also tolerates an explicit null.
  const {
    seed = new Set(["https://medium.com/"]),
    visited = new Set(),
    seedForNext = new Set(),
    iteration = 1,
    depthLimit = 5,
    cocurentConnectionLimit = 5
  } = options || {};

  console.log("iteration #" + iteration);

  const tasks = Array.from(seed, (url, i) => (done) => {
    visited.add(url);
    console.log(`Fetching ${i + 1} of ${seed.size}: ${url}`);
    request({
      method: "GET",
      uri: url,
      followRedirect: false
    }, (error, response, body) => {
      // Anything thrown here would otherwise escape as an uncaught exception
      // and `done` would never run, so failures for one URL are logged and
      // the crawl moves on.
      try {
        if (error) {
          console.log(new Error(error));
        }
        else if (response.statusCode === 200) {
          wstream.write(url + "\n");
          // union() adds into its first argument, so seedForNext grows in place.
          union(seedForNext, filterUrls(fetchUrlsfromHTML(body), url, visited));
        }
        else {
          console.log("statusCode", response.statusCode, url);
          // Redirects are not followed automatically; queue the target so it
          // goes through the same origin/visited filtering as any other link.
          if (response.headers.location) {
            union(seedForNext, filterUrls([response.headers.location], url, visited));
          }
        }
      }
      catch (ex) {
        console.log(ex);
      }
      // Per-URL problems are never reported as task errors: one bad page must
      // not abort the whole iteration.
      done();
    });
  });

  async.parallelLimit(tasks, cocurentConnectionLimit, (error) => {
    if (error) {
      cb(error);
      return;
    }
    const nextIteration = iteration + 1;
    if (seedForNext.size && nextIteration <= depthLimit) {
      crawl({
        seed: seedForNext,
        visited,
        seedForNext: new Set(),
        iteration: nextIteration,
        depthLimit,
        cocurentConnectionLimit
      }, cb);
    }
    else if (nextIteration > depthLimit) {
      console.log("max Depth reached");
      cb(null, visited);
    }
    else {
      console.log("no more urls found");
      cb(null, visited);
    }
  });
};
/**
 * Extracts the absolute URLs referenced by the `href` attribute of `<a>` tags
 * in an HTML string.
 *
 * The scan is regex based, not a real HTML parse: only quoted attribute
 * values (double or single quotes) are recognised. A value that `new URL()`
 * cannot parse on its own is skipped; that includes relative links, because
 * no base URL is available here.
 *
 * @param {string} body - HTML source to scan.
 * @returns {string[]} The `href` form of every absolute URL found, in
 *   document order; duplicates are kept.
 */
function fetchUrlsfromHTML(body) {
  // `<a` must be followed by whitespace so tags such as <abbr> or <article>
  // are not mistaken for anchors. `href` must be preceded by whitespace so
  // look-alike attributes such as `data-href` are ignored, and the lazy
  // `[^>]*?` makes the first real `href` of the tag win.
  const anchorHref = /<a(?=\s)[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
  const urls = [];
  let match;
  // Every match consumes at least `<a href=""`, so the global regex always
  // advances and no zero-length-match guard is needed.
  while ((match = anchorHref.exec(body)) !== null) {
    // Group 1 holds a double-quoted value, group 2 a single-quoted one.
    const rawHref = match[1] !== undefined ? match[1] : match[2];
    try {
      urls.push(new URL(rawHref).href);
    }
    catch (ex) {
      // Deliberate best effort: relative, empty or malformed values are not
      // absolute URLs and are simply left out of the result.
    }
  }
  return urls;
}
/**
 * Reduces a list of link targets to the not-yet-visited URLs that live on the
 * same origin as `href`.
 *
 * Each entry is resolved against `href`, kept only when its origin equals the
 * origin of `href`, and normalized to `origin + pathname` (query string and
 * fragment are dropped). Entries that cannot be parsed as a URL are skipped so
 * that a single malformed link or `Location` header does not discard the
 * whole batch.
 *
 * @param {string[]} list - Candidate URLs, absolute or relative to `href`.
 * @param {string} href - Absolute URL of the page the candidates came from.
 * @param {Set<string>} visited - Normalized URLs that must be excluded.
 * @returns {Set<string>} Unique normalized URLs, in first-seen order.
 * @throws {TypeError} If `href` itself is not a valid absolute URL.
 */
function filterUrls(list, href, visited) {
  const pageOrigin = new URL(href).origin;
  const accepted = new Set();
  for (const candidate of list) {
    let parsed;
    try {
      parsed = new URL(candidate, href);
    }
    catch (ex) {
      // The URL constructor signals unparseable input with a TypeError;
      // anything else is unexpected and must not be hidden.
      if (!(ex instanceof TypeError)) {
        throw ex;
      }
      continue;
    }
    // Should be an internal url
    if (parsed.origin !== pageOrigin) {
      continue;
    }
    const normalized = parsed.origin + parsed.pathname;
    // Should not be in the visited urls list
    if (!visited.has(normalized)) {
      accepted.add(normalized);
    }
  }
  return accepted;
}
/**
 * Merges every element of `setB` into `setA`.
 *
 * The merge happens in place: `setA` is modified and is also the value
 * returned, while `setB` is left untouched.
 *
 * @param {Set} setA - Set that receives the elements.
 * @param {Iterable} setB - Elements to add.
 * @returns {Set} The same `setA` instance, now containing the elements of both.
 */
function union(setA, setB) {
  [...setB].forEach((member) => {
    setA.add(member);
  });
  return setA;
}
// Public API of this module: only crawl is exported; fetchUrlsfromHTML,
// filterUrls and union are not part of the exports object.
module.exports = { crawl };