-
Notifications
You must be signed in to change notification settings - Fork 0
/
tester.py
45 lines (35 loc) · 1.58 KB
/
tester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
# simple test URL: http://classics.mit.edu/index.html
# complex test URL: https://www.google.com/maps/place/Carnegie+Mellon+University+,+Pittsburgh+,+PA'
# this is to test for a way for me to extract address from google maps
# exec(open(r'c:\users\herman\gpe\usnews\tester.py','r').read())
# current situation: it seems like the request response, after soup
# straining, is the HTML I see in Chrome's element inspector, hidden
# inside some <script></script> mechanism. Apparently, these represent
# javascript executable script. As a result, what I'm looking for is
# generated by js. I will need selenium to open it.
# Target: to find and extract address contained in
# <div class="widget-pane-section-header-description">, which are two
# <h2> tags.
import sys
import urllib
import urllib.request
from bs4 import BeautifulSoup
url = r'http://classics.mit.edu/index.html'
complexu = r'https://www.google.com/maps/place/'
ghpurl = r'https://www.google.com/search?q=address+for+'+'Tufts+University+,+Medford,+MA'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'
headers = { 'User-Agent' : user_agent }
f = open(r'c:\users\herman\gpe\usnews\htmlcmu.txt', 'wb')
def main():
req = urllib.request.Request(ghpurl,data=None, headers=headers)
with urllib.request.urlopen(req) as request:
html = request.read()
soup = BeautifulSoup(html, 'html.parser')
result = soup.find('div', class_='_eF').string
print(result)
f.write(result.encode('utf-8'))
f.close()
sys.exit(1)
if __name__ == '__main__':
main()