-
Notifications
You must be signed in to change notification settings - Fork 0
/
tester.py
45 lines (35 loc) · 1.58 KB
/
tester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
# simple test URL: http://classics.mit.edu/index.html
# complex test URL: https://www.google.com/maps/place/Carnegie+Mellon+University+,+Pittsburgh+,+PA'
# this is to test for a way for me to extract address from google maps
# exec(open(r'c:\users\herman\gpe\usnews\tester.py','r').read())
# current situation: it seems like the request response, after soup
# straining, is the HTML I see in Chrome's element inspector, hidden
# inside some <script></script> mechanism. Apparently, these represent
# javascript executable script. As a result, what I'm looking for is
# generated by js. I will need selenium to open it.
# Target: to find and extract address contained in
# <div class="widget-pane-section-header-description">, which are two
# <h2> tags.
import sys
import urllib
import urllib.request
from bs4 import BeautifulSoup
url = r'http://classics.mit.edu/index.html'
complexu = r'https://www.google.com/maps/place/'
ghpurl = r'https://www.google.com/search?q=address+for+'+'Tufts+University+,+Medford,+MA'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'
headers = { 'User-Agent' : user_agent }
f = open(r'c:\users\herman\gpe\usnews\htmlcmu.txt', 'wb')
def main():
req = urllib.request.Request(ghpurl,data=None, headers=headers)
with urllib.request.urlopen(req) as request:
html = request.read()
soup = BeautifulSoup(html, 'html.parser')
result = soup.find('div', class_='_eF').string
print(result)
f.write(result.encode('utf-8'))
f.close()
sys.exit(1)
if __name__ == '__main__':
main()