-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
36 lines (28 loc) · 1.08 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import csv
from datetime import datetime
from requests_html import HTMLSession
def get_sources(path="../newsprioritiestoday-data/sources.csv"):
    """Load the list of news sources from a CSV file.

    The first row of the file is treated as the header. Every following
    non-blank row becomes a dict that maps each header name to the value in
    the same column; fields beyond the header's width are ignored.

    Args:
        path: Location of the CSV file. Defaults to the sources file in the
            sibling data directory, resolved against the current working
            directory.

    Returns:
        A list with one dict per data row, in file order. An empty or
        header-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer fields than the header.
    """
    # newline="" hands newline handling to the csv module, which is required
    # for quoted fields that contain line breaks to be parsed correctly.
    with open(path, "r", newline="") as file:
        reader = csv.reader(file, delimiter=",")
        # A default avoids leaking a bare StopIteration on an empty file.
        header = next(reader, None)
        if header is None:
            return []
        # The csv reader reports blank lines as empty rows; skip them rather
        # than failing on the first column lookup.
        return [
            {item: row[index] for index, item in enumerate(header)}
            for row in reader
            if row
        ]
def fetch_news(source, timeout=30):
    """Download a source's page and archive its raw HTML on disk.

    The snapshot is written to
    ``../newsprioritiestoday-data/raw/<directory>/<directory>_<YYYY-MM-DD HH>h.html``
    (relative to the current working directory). The file name only carries
    the hour, so a second run within the same hour overwrites the earlier
    snapshot.

    Args:
        source: Mapping with the keys ``"name"`` (used in the log output),
            ``"url"`` (page to download) and ``"directory"`` (sub-directory
            and file-name prefix of the snapshot).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive site would block the run forever.

    Raises:
        KeyError: If ``source`` lacks one of the required keys.
        requests.exceptions.RequestException: If the download fails or
            times out.
        OSError: If the target directory or file cannot be written.
    """
    print("Fetching news from " + source["name"])

    # Use the session as a context manager so its connections are released
    # even when the request raises.
    with HTMLSession() as session:
        r = session.get(source["url"], timeout=timeout)
        raw_html = r.html.raw_html

    target_directory = "../newsprioritiestoday-data/raw/" + source["directory"]
    if not os.path.exists(target_directory):
        print("Path for " + source["name"] + " does not exist. Creating path.")
    # exist_ok closes the gap between the existence check and the creation.
    os.makedirs(target_directory, exist_ok=True)

    timestamp = datetime.now().strftime("%Y-%m-%d %H")
    file_name = source["directory"] + "_" + timestamp + "h.html"
    # Write the bytes exactly as received: decoding and re-encoding would
    # fail on pages that are not UTF-8 and depend on the locale's encoding.
    with open(target_directory + "/" + file_name, "wb") as file:
        file.write(raw_html)
# Script body: read the configured sources, then archive each one in the
# order it appears in the sources file. Any error aborts the remaining
# sources because nothing here catches exceptions.
sources = get_sources()
for source in sources:
    fetch_news(source)