/
irish_times.recipe
115 lines (103 loc) · 4.26 KB
/
irish_times.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
__license__ = 'GPL v3'
__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl, 2016 by leo738"
'''
irishtimes.com
'''
import json
from uuid import uuid4
from mechanize import Request
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from calibre.web.feeds.news import BasicNewsRecipe, classes
class IrishTimes(BasicNewsRecipe):
    '''
    Calibre recipe for The Irish Times: scrapes the homepage into sections
    and (optionally) logs in through the paywall.
    '''
    title = u'The Irish Times'
    __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
    description = 'Daily news from The Irish Times'
    needs_subscription = True
    language = 'en_IE'
    masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'
    encoding = 'utf-8'
    oldest_article = 1.0
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    remove_empty_feeds = True
    no_stylesheets = True
    temp_files = []

    # Keep the headline and the lead image/body wrappers; strip everything else.
    keep_only_tags = [
        dict(name=['h1', 'h2']),
        classes('lead-art-wrapper article-body-wrapper'),
    ]
    remove_tags = [
        dict(name='button')
    ]
    remove_attributes = ['width', 'height']

    def parse_index(self):
        '''
        Scrape https://www.irishtimes.com/ and return a list of
        (section_name, [{'title': ..., 'url': ...}, ...]) tuples.

        Section headings are <h3> tags; articles are <article> tags that
        follow them. Articles seen before the first heading go into the
        synthetic "Home page" section.
        '''
        soup = self.index_to_soup('https://www.irishtimes.com/')
        section = 'Home page'
        articles = []
        feeds = []
        for x in soup.findAll(name=['h3', 'article']):
            if x.name == 'h3':
                # Skip author-bio headings. NOTE: the original
                # `'writer_description' in x.get('class') or ''` parsed as
                # `(... in x.get('class')) or ''` and raised TypeError on an
                # <h3> with no class attribute; parenthesize the fallback.
                if 'writer_description' in (x.get('class') or ''):
                    continue
                # Flush the previous section only if it collected articles.
                articles and feeds.append((section, articles))
                section = self.tag_to_string(x)
                articles = []
                self.log('Section:', section)
                continue
            # Prefer the primary headline link; fall back to promo headlines.
            a = x.find('a', attrs={'class': lambda x: x and 'primary-font'}, href=True)
            if a is None:
                a = x.find('a', attrs={'class': lambda x: x and 'promo-headline' in x}, href=True)
            if a:
                q = ''.join(a['class'])
                # Secondary-font links on the homepage are duplicates of
                # articles already listed under their own sections.
                if 'secondary-font' in q and section == 'Home page':
                    continue
                title = self.tag_to_string(a)
                url = a['href']
                if url.startswith('/'):
                    url = 'https://www.irishtimes.com' + url
                articles.append({'title': title, 'url': url})
                self.log('\t', title)
        # Flush the final section.
        articles and feeds.append((section, articles))
        return feeds

    def get_browser(self):
        # Paywall login is currently disabled: return a plain browser.
        # The working signin flow is preserved, unused, in _login_browser()
        # below so it can be re-enabled without re-deriving it.
        return super().get_browser()

    def _login_browser(self):
        '''
        Disabled paywall-login flow, kept verbatim for future re-enabling.

        To understand the signin logic read the signin javascript from the
        submit button on https://www.irishtimes.com/signin

        Returns a logged-in browser; raises ValueError on bad credentials.
        '''
        # A throwaway browser fetches our geo data (country / EU status),
        # which the site expects to find in cookies before login.
        br = BasicNewsRecipe.get_browser(self, user_agent='curl/7.80.0')
        ip_data = json.loads(br.open('https://ipapi.co//json').read())
        br = BasicNewsRecipe.get_browser(self)
        url = 'https://www.irishtimes.com/signin'
        # The site identifies each login session by a random device id.
        deviceid = str(uuid4()).replace('-', '')
        # Enable debug stuff?
        # br.set_debug_http(True)
        br.open(url).read()
        from pprint import pprint
        pprint(ip_data)
        br.set_cookie('IT_country', ip_data['country_code'], '.irishtimes.com')
        br.set_cookie('IT_eu', 'true' if ip_data['in_eu'] else 'false', '.irishtimes.com')
        rurl = 'https://www.irishtimes.com/auth-rest-api/v1/paywall/login'
        rq = Request(rurl, headers={
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://www.irishtimes.com',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
        }, data=urlencode({'username': self.username, 'password': self.password, 'deviceid': deviceid, 'persistent': 'on', 'rid': ''}))
        r = br.open(rq)
        raw = r.read()
        data = json.loads(raw)
        # A successful login response contains a user_id field.
        if r.code != 200 or b'user_id' not in raw:
            pprint(data)
            raise ValueError('Failed to log in check username/password')
        # The varnish_id from the response is the actual auth token cookie.
        br.set_cookie('IT_PW_AUTH', data['varnish_id'], '.irishtimes.com')
        # br.set_debug_http(False)
        return br