├── NCDC-NOAA-wget.sh ├── GLERLNOAA-wget.sh ├── EnergyAutomatedRegister-wget.sh ├── GoogleDirectionsBuildURL-API.py ├── Kalamazoo-BS.py ├── GoogleSearch-dry.py ├── CrunchBase-API.py ├── Grimm-BS.py ├── FRMinutesDiscoutRate-BS.py ├── GLORecords-dry.py ├── GoogleGeocodeSearches.py ├── OpenSecrets-API.py ├── Perrault-BS.py ├── SFPlanning-BS.py ├── WebofScience-API.py ├── LICENSE ├── DataSFFireIncidents-API.r ├── GlassDoor-API.py ├── NationalStolenArtFile-BS.py ├── WikipediaRevisionHistory-API.py ├── INOCAR-selenium.py ├── LucidChart-BS.py ├── MRIPNOAA-selenium.py ├── .gitignore ├── Kiva-API.py ├── BGG-BS.py ├── BoardGameCapital-selenium.py ├── IMSDB-BS.py ├── NHSTrustsInfo-BS.py ├── INOCAR-AJAX.py ├── GoogleGeoLatLong-API.py ├── LARRP-BS.py ├── Wiktionary-API.py ├── ADA-ERP-BS.py ├── RioGrandeGames-selenium.py ├── STNMFSNOAA-BS.py ├── BAAD-BS.py ├── README.md ├── BSBDigitaleSammlungen-API.py ├── AHA-selenium.py ├── ResidentAdvisor-selenium.py ├── CTSNet-selenium.py ├── Doximity-selenium.py ├── RateMyProfessors-selenium.py ├── DataMartBasicSkills-req.py └── PACER-selenium.py /NCDC-NOAA-wget.sh: -------------------------------------------------------------------------------- 1 | wget -r ftp://data.ncdc.noaa.gov/cdr/solar-irradiance/tsi/ -------------------------------------------------------------------------------- /GLERLNOAA-wget.sh: -------------------------------------------------------------------------------- 1 | wget -r --no-parent https://www.glerl.noaa.gov//metdata/status/status_archive/ -------------------------------------------------------------------------------- /EnergyAutomatedRegister-wget.sh: -------------------------------------------------------------------------------- 1 | wget https://www.energy.gov/eere/downloads/automated-register-implemented-actions 2 | mv automated-register-implemented-actions automated-register-implemented-actions.html 3 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2.xlsx 4 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2%20User%20Manual.pdf -------------------------------------------------------------------------------- /GoogleDirectionsBuildURL-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | base = "https://www.google.com/maps/dir/" 4 | 5 | locs = [ 6 | "305 Harrison St, seattle, WA 98109, USA", 7 | "san francisco airport", 8 | "UC berkeley", 9 | "stanford university"] 10 | 11 | # API STUFF HERE 12 | 13 | ordered_locs = [] 14 | 15 | final_url = base 16 | 17 | for l in locs: 18 | final_url += '+'.join(l.split()) + "/" 19 | 20 | print(final_url) 21 | -------------------------------------------------------------------------------- /Kalamazoo-BS.py: -------------------------------------------------------------------------------- 1 | from urllib.request import Request, urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | urls = [] 5 | for i in range(1035, 1053): 6 | urls.append( 7 | "http://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=" + 8 | str(i) + 9 | "&context=medieval_cong_archive") 10 | 11 | for i, url in enumerate(urls): 12 | res = urlopen(Request(url)) 13 | pdf = open(("kzoo/kalamazoo_" + str(i) + ".pdf"), 'wb') 14 | pdf.write(res.read()) 15 | pdf.close() 16 | -------------------------------------------------------------------------------- /GoogleSearch-dry.py: -------------------------------------------------------------------------------- 1 | import dryscrape 2 | import sys 3 | 4 | 5 | 
search_term = 'testing' 6 | 7 | # set up a web scraping session 8 | sess = dryscrape.Session(base_url='http://google.com') 9 | 10 | # we don't need images 11 | sess.set_attribute('auto_load_images', False) 12 | 13 | # visit homepage and search for a term 14 | sess.visit('/') 15 | q = sess.at_xpath('//*[@name="q"]') 16 | q.set(search_term) 17 | q.form().submit() 18 | 19 | # extract all links 20 | for link in sess.xpath('//a[@href]'): 21 | print(link) 22 | print(link['href']) 23 | 24 | # # save a screenshot of the web page 25 | # sess.render('google.png') 26 | # print("Screenshot written to 'google.png'") 27 | -------------------------------------------------------------------------------- /CrunchBase-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from __future__ import division 4 | import math 5 | import csv 6 | 7 | # set key 8 | key = "PUT_KEY_HERE" 9 | 10 | # set base url 11 | base_url = "https://api.crunchbase.com/v/3/organizations" 12 | 13 | # set response format 14 | response_format = ".json" 15 | 16 | # set search parameters 17 | search_params = {"name": "uber", 18 | "user_key": key, 19 | "page": "1"} 20 | 21 | # make request 22 | r = requests.get(base_url + response_format, params=search_params) 23 | response_text = r.text 24 | 25 | # Convert JSON response to a dictionary 26 | data = json.loads(response_text) 27 | 28 | print(data.keys()) 29 | -------------------------------------------------------------------------------- /Grimm-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pandas as pd 5 | 6 | soup = BeautifulSoup(requests.get( 7 | "https://www.cs.cmu.edu/~spok/grimmtmp/").text, 'html5lib') 8 | 9 | titles = [x.text.strip() for x in soup.find_all("li")] 10 | 11 | base = 'https://www.cs.cmu.edu/~spok/grimmtmp/' 12 | rows = [] 13 | 14 | for i in range(1, 210): 15 | 16 | url = 'https://www.cs.cmu.edu/~spok/grimmtmp/{}.txt'.format( 17 | str(i).zfill(3)) 18 | 19 | text = requests.get(url).text.strip() 20 | 21 | rows.append([titles[i - 1], text]) 22 | 23 | time.sleep(1) 24 | 25 | df = pd.DataFrame(rows, columns=['Title', 'Text']) 26 | df.to_csv("grimm.csv", index=False) 27 | -------------------------------------------------------------------------------- /FRMinutesDiscoutRate-BS.py: -------------------------------------------------------------------------------- 1 | from urllib.request import Request, urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | html = urlopen("http://www.federalreserve.gov/monetarypolicy/discountrate.htm") 6 | bsObj = BeautifulSoup(html.read(), "lxml") 7 | d1 = bsObj.findAll("option") 8 | 9 | urls = [] 10 | for item in d1: 11 | if "PDF" in str(item.get_text()): 12 | prefix = "http://www.federalreserve.gov" 13 | url = prefix + str(item['value']) 14 | urls.append((url, str(item.get_text()))) 15 | 16 | urls = urls[:3] 17 | 18 | print(len(urls)) 19 | 20 | for url in urls: 21 | res = urlopen(Request(url[0])) 22 | pdf = open((url[1] + ".pdf"), 'wb') 23 | pdf.write(res.read()) 24 | pdf.close() 25 | -------------------------------------------------------------------------------- /GLORecords-dry.py: -------------------------------------------------------------------------------- 1 | import dryscrape 2 | import sys 3 | from urllib.request import Request, urlopen 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | 8 | urls = ["http://www.glorecords.blm.gov"] 9 | ext 
= "/ConvertedImages/CV_Patent_0123-207.PDF" 10 | 11 | for url in urls: 12 | # set up a web scraping session 13 | sess = dryscrape.Session(base_url=url) 14 | 15 | # we don't need images 16 | sess.set_attribute('auto_load_images', True) 17 | 18 | # visit homepage and search for a term 19 | sess.visit(ext) 20 | time.sleep(15) 21 | # sess.render('sshot.png') 22 | 23 | res = urlopen(Request(url + ext)) 24 | pdf = open((url[1] + ".pdf"), 'wb') 25 | pdf.write(res.read()) 26 | pdf.close() 27 | -------------------------------------------------------------------------------- /GoogleGeocodeSearches.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib 3 | import time 4 | 5 | searches = ['UC Berkeley', 'University of Minnesota', 'Middlebury College'] 6 | 7 | latitude = [] 8 | longitude = [] 9 | for s in searches: 10 | search = urllib.parse.quote(s) 11 | 12 | print(s) 13 | 14 | try: 15 | json_res = requests.get( 16 | 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(search)).json() 17 | coordinates = json_res['results'][0]['geometry']['location'] 18 | latitude.append(coordinates['lat']) 19 | longitude.append(coordinates['lng']) 20 | except: 21 | latitude.append(None) 22 | longitude.append(None) 23 | 24 | time.sleep(.5) 25 | 26 | print(list(zip(latitude, longitude))) 27 | -------------------------------------------------------------------------------- /OpenSecrets-API.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.request import Request, urlopen 3 | 4 | 5 | def getJson(func, apikey, params): 6 | url = 'http://www.opensecrets.org/api/?method=%s&output=json&%s&apikey=%s' % \ 7 | (func, params, apikey) 8 | 9 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 10 | 11 | response = urlopen(req).read().decode('utf-8') 12 | responseJson = json.loads(response) 13 | 14 | return responseJson 15 | 16 | func = "getOrgs" 17 | apikey = "" 18 | params = "org=Exxon" 19 | 20 | info = getJson(func, apikey, params) 21 | 22 | print(info) 23 | 24 | orgid = info.get("response").get("organization")[ 25 | 0].get("@attributes").get("orgid") 26 | 27 | func = "orgSummary" 28 | params = "id=" + orgid 29 | summary = getJson(func, apikey, params) 30 | 31 | print(summary) 32 | -------------------------------------------------------------------------------- /Perrault-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pandas as pd 5 | 6 | rows = [] 7 | for i in range(1, 12): 8 | 9 | url = 'http://www.pitt.edu/~dash/perrault{}.html'.format( 10 | str(i).zfill(2)) 11 | 12 | soup = BeautifulSoup(requests.get(url).text, 'html5lib') 13 | 14 | title = soup.find('h1').text.strip() 15 | text = '\n'.join([p.text for p in soup.find_all('p')[:-1]]) 16 | try: 17 | text += soup.find('blockquote').text 18 | except: 19 | pass 20 | 21 | bullets = soup.find_all('li') 22 | for b in bullets: 23 | if "aarne" in b.text.lower(): 24 | at = ''.join([ch for ch in b.text if ch.isnumeric()]) 25 | 26 | rows.append([title, at, text]) 27 | 28 | time.sleep(1) 29 | 30 | df = pd.DataFrame(rows, columns=['Title', 'Aarne-Thompson', 'Text']) 31 | df.to_csv("perrault.csv", index=False) 32 | -------------------------------------------------------------------------------- /SFPlanning-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 
from bs4 import BeautifulSoup 3 | import time 4 | 5 | all_links = [] 6 | res = requests.get('http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html') 7 | soup = BeautifulSoup(res.text, 'lxml') 8 | 9 | # build date links 10 | base = 'http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/' 11 | links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')] 12 | 13 | # collect nested links 14 | for l in links: 15 | res = requests.get(l) 16 | soup = BeautifulSoup(res.text, 'lxml') 17 | 18 | links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')] 19 | all_links.extend(links) 20 | time.sleep(1) 21 | 22 | # save HTML response for all links 23 | for l in all_links: 24 | html = requests.get(l).text 25 | name = l.split('=')[-1] 26 | print(name) 27 | with open('sfplanning/' + name, 'w') as f: 28 | f.write(html) 29 | time.sleep(1) 30 | -------------------------------------------------------------------------------- /WebofScience-API.py: -------------------------------------------------------------------------------- 1 | from wos import WosClient 2 | import wos.utils 3 | import time 4 | 5 | # must be on campus with access 6 | with WosClient('') as client: 7 | journals = ["Science"] 8 | years = range(2000, 2001) 9 | for journal in journals: 10 | for year in years: 11 | 12 | rf = wos.utils.recordsFound( 13 | client, 'PY=' + str(year) + ' AND SO=' + journal) 14 | 15 | for num in range(1, rf, 100): 16 | 17 | info = wos.utils.query( 18 | client, 19 | 'PY=' + 20 | str(year) + 21 | ' AND SO=' + 22 | journal, 23 | count=100, 24 | frecord=num) 25 | 26 | with open("data/" + str(year) + '-' + journal + ' ' + str(num) + ".xml", "w") as f: 27 | f.write(str(info.encode('utf-8'))) 28 | 29 | time.sleep(2) 30 | 31 | # http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Christopher Hench 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /DataSFFireIncidents-API.r: -------------------------------------------------------------------------------- 1 | ################################################## 2 | ## Project: Collect API data on fire incidents 3 | ## Author: Christopher Hench 4 | ################################################## 5 | 6 | flatten_json <- function(df) { 7 | 8 | for (col in names(df)) { 9 | if (is.list(df[[col]])) { 10 | i <- 1 11 | for (row in df[[col]]) { 12 | 13 | df[[col]][i] <- paste(row, collapse = '; ') 14 | i <- i + 1 15 | } 16 | df[[col]] <- unlist(df[[col]]) 17 | } 18 | } 19 | return (df) 20 | } 21 | 22 | base_url <- 'https://data.sfgov.org/resource/wbb6-uh78.json?' 23 | 24 | incident_date <- '2017-10-22T00:00:00.000' 25 | incident_date <- URLencode(URL = incident_date, reserved = TRUE) 26 | 27 | get_request <- paste0(base_url, "incident_date=", incident_date) 28 | print(get_request) 29 | 30 | response <- httr::GET(url = get_request) 31 | response <- httr::content(x = response, as = "text") 32 | response_df <- data.frame(jsonlite::fromJSON(txt = response, simplifyDataFrame = TRUE, flatten = TRUE)) 33 | 34 | flattened <- flatten_json(response_df) 35 | 36 | write.csv(flattened, file='fire-incidents.csv') -------------------------------------------------------------------------------- /GlassDoor-API.py: -------------------------------------------------------------------------------- 1 | # https://pypi.python.org/pypi/glassdoor 2 | # http://stackoverflow.com/questions/30956891/rest-glassdoor-api-requires-user-agent-in-header 3 | import urllib.request as request 4 | import requests 5 | import json 6 | from collections import OrderedDict 7 | 8 | # authentication information & other request parameters 9 | params_gd = OrderedDict({ 10 | "v": "1", 11 | "format": "json", 12 | "t.p": "", 13 | "t.k": "", 14 | "action": "employers", 15 | "employerID": "11111", 16 | # programmatically get the IP of the machine 17 | "userip": json.loads(request.urlopen("http://ip.jsontest.com/").read().decode('utf-8'))['ip'], 18 | "useragent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36" 19 | }) 20 | 21 | # construct the URL from parameters 22 | basepath_gd = 'http://api.glassdoor.com/api/api.htm' 23 | 24 | # request the API 25 | response_gd = requests.get( 26 | basepath_gd, params=params_gd, headers={ 27 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36"}) 28 | 29 | # check the response code (should be 200) & the content 30 | response_gd 31 | data = json.loads(response_gd.text) 32 | 33 | print(data["response"]["employers"][0].keys()) 34 | -------------------------------------------------------------------------------- /NationalStolenArtFile-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pickle 5 | import pandas as pd 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} 9 | artworks = [] 10 | for i in range(0, 7200, 100): 11 | print(i) 12 | url = 'https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file?b_start:int=' + \ 13 | str(i) 14 | res = requests.get(url, headers) 15 | soup = BeautifulSoup(res.text, 'html5lib') 16 | 17 | for i in soup.find_all('li', 
{'class': 'grid-item'}): 18 | 19 | art = {} 20 | art['title'] = i.find('h3').text 21 | art['description'] = i.find('p').text 22 | 23 | try: 24 | art['image_link'] = i.find('img')['src'] 25 | except: 26 | art['image_link'] = 'None' 27 | 28 | keys = [x.text for x in i.find_all('b')] 29 | values = [x.text for x in i.find_all('span')] 30 | 31 | for t in list(zip(keys, values)): 32 | art[t[0]] = t[1] 33 | 34 | artworks.append(art) 35 | 36 | pickle.dump(artworks, open('artworks.pkl', 'wb')) 37 | time.sleep(5) 38 | 39 | pd.DataFrame(artworks).to_csv('artworks.csv', index=False) 40 | -------------------------------------------------------------------------------- /WikipediaRevisionHistory-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import time 4 | import re 5 | from bs4 import BeautifulSoup 6 | import json 7 | import pickle 8 | 9 | 10 | def get_revisions(page_title, num_rev): 11 | url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&rvprop=ids|flags|timestamp|comment|user|content|tags|flags&rvlimit=1&rvdiffto=prev&titles=" + page_title 12 | revisions = [] 13 | next_request = '' # information for the next request 14 | 15 | # while True: 16 | for i in range(num_rev): 17 | response = json.loads( 18 | requests.get( 19 | url + 20 | next_request).text) # web request 21 | 22 | page_id = list(response['query']['pages'].keys())[0] 23 | revisions.append( 24 | response['query']['pages'][ 25 | str(page_id)]['revisions'][0]) 26 | 27 | cont = response['continue']['rvcontinue'] 28 | if not cont: # break the loop if 'continue' element missing 29 | break 30 | 31 | # gets the revision Id from which to start the next request 32 | next_request = "&rvcontinue=" + cont 33 | 34 | time.sleep(1) 35 | 36 | return revisions 37 | 38 | 39 | page_names = pickle.load(open('page_names.pkl', 'rb')) 40 | 41 | for p in page_names: 42 | print(p) 43 | results = get_revisions(p, 200) 44 | pickle.dump(results, open('pickles/' + p + '.pkl', 'wb')) 45 | -------------------------------------------------------------------------------- /INOCAR-selenium.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.support.ui import Select 8 | from bs4 import BeautifulSoup 9 | 10 | 11 | def init_driver(): 12 | driver = webdriver.Chrome() 13 | driver.wait = WebDriverWait(driver, 5) 14 | return driver 15 | 16 | 17 | def lookup(driver, query): 18 | driver.get("http://www.inocar.mil.ec/mareas/pagina_mareas.php") 19 | # a = driver.wait.until(EC.presence_of_element_located((By.NAME, 20 | # "id_puerto"))) 21 | driver.find_element_by_xpath( 22 | "//select[@name='id_puerto']/option[@value='378']").click() 23 | driver.find_element_by_xpath( 24 | "//select[@name='dias']/option[@value='1']").click() 25 | driver.find_element_by_xpath( 26 | "//select[@name='mes']/option[@value='1']").click() 27 | driver.find_element_by_xpath( 28 | "//select[@name='anio']/option[@value='2015']").click() 29 | driver.find_element_by_name("Submit").click() 30 | 31 | html = driver.page_source 32 | soup = BeautifulSoup(html, 'lxml') 33 | a = soup.findAll("div") 34 | print(a) 35 | 36 | if __name__ == "__main__": 37 | driver = 
init_driver() 38 | lookup(driver, "Selenium") 39 | time.sleep(5) 40 | driver.quit() 41 | -------------------------------------------------------------------------------- /LucidChart-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import json 3 | import csv 4 | import sys 5 | 6 | # read in html source of chart 7 | html_path = sys.argv[1] 8 | with open(html_path, "r") as f: 9 | html = f.read() 10 | 11 | soup = BeautifulSoup(html, "lxml") 12 | 13 | # find line of JSON data 14 | raw_data = str(soup)[ 15 | str(soup).find("var doc = ") + 16 | len("var doc = "):str(soup).find(";\n doc.Document.state = doc.Document.state")] 17 | 18 | figure_data = json.loads(raw_data) 19 | 20 | # get states JSON 21 | states = json.loads(figure_data["Document"]['state']) 22 | 23 | 24 | def find_corr_text(thread_id, soup): 25 | ''' 26 | find the text from a ThreadId 27 | ''' 28 | item_id = states['Threads'][thread_id]["ItemId"] 29 | loc = str(soup).find(item_id) 30 | end = str(soup)[loc:].find("}}") 31 | raw = str(soup)[loc + len(item_id) + 3:][:end - \ 32 | len(item_id) - 1].replace("\\", "") 33 | 34 | try: 35 | props = json.loads(raw) 36 | text = props["Properties"]["Text"]['t'] 37 | except: 38 | return None 39 | 40 | return text 41 | 42 | # cycle through comments and add text 43 | rows = [] 44 | for k in states['Comments'].keys(): 45 | states['Comments'][k]['text'] = find_corr_text( 46 | states['Comments'][k]['ThreadId'], soup) 47 | rows.append(states['Comments'][k]) 48 | 49 | # write csv 50 | with open('lucidchart-comments.csv', 'w') as f: 51 | w = csv.DictWriter(f, list(set(list(rows[0].keys()) + ['Type']))) 52 | w.writeheader() 53 | w.writerows(rows) 54 | -------------------------------------------------------------------------------- /MRIPNOAA-selenium.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | from urllib.request import Request, urlopen 10 | import time 11 | 12 | 13 | driver = webdriver.Chrome() # needs chromedriver in PATH 14 | 15 | # iframed into 16 | # http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project 17 | driver.get("https://www.st.nmfs.noaa.gov/pims/#view=public_page&program_id=1") 18 | 19 | time.sleep(15) 20 | 21 | for i in range(11): 22 | 23 | projects = [] 24 | 25 | for i in driver.find_elements_by_class_name("dijitTitlePaneTextNode"): 26 | os.mkdir("MRIP/" + i.text) 27 | projects.append(i.text) 28 | 29 | content_pane = driver.find_elements_by_class_name("dijitContentPane")[0] 30 | links = content_pane.find_elements_by_class_name("docLink") 31 | if len(links) > 0: 32 | project_ct = -1 33 | for l in links: 34 | if l.text == "Proposal": # begins each new project 35 | project_ct += 1 36 | with open("MRIP/" + projects[project_ct] + "/" + "source.html", 'w') as f: 37 | f.write(str(driver.page_source)) 38 | 39 | res = urlopen(Request(l.get_attribute("href"))) 40 | with open("MRIP/" + projects[project_ct] + "/" + l.text + ".pdf", 'wb') as pdf: 41 | pdf.write(res.read()) 42 | 43 | time.sleep(1) 44 | 45 | driver.find_element_by_id("dijit_form_Button_4_label").click() 46 | time.sleep(1) 47 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Kiva-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | 5 | status = ["funded", "expired"] 6 | 7 | all_loans = [] 8 | 9 | for s in status: 10 | for i in range(1, 2): # change range to 10000 11 | # set base url 12 | base_url = "http://api.kivaws.org/v1/loans/search" 13 | 14 | # set response format 15 | response_format = ".json" 16 | 17 | # set search parameters 18 | search_params = {"status": s, 19 | "sort_by": "newest", 20 | "page": i} 21 | 22 | # make request 23 | r = requests.get(base_url + response_format, params=search_params) 24 | time.sleep(1.1) 25 | response_text = r.text 26 | 27 | # Convert JSON response to a dictionary 28 | data = json.loads(response_text) 29 | 30 | last_date = data["loans"][-1]["posted_date"] 31 | 32 | if "2016" in last_date[:4]: 33 | for l in data["loans"]: 34 | l_id = str(l["id"]) 35 | 36 | # set base url 37 | base_url = "http://api.kivaws.org/v1/loans/" 38 | 39 | # set response format 40 | response_format = ".json" 41 | 42 | # make request 43 | r = requests.get(base_url + l_id + response_format) 44 | time.sleep(1.1) 45 | response_text = r.text 46 | 47 | # Convert JSON response to a dictionary 48 | detailed_data = json.loads(response_text) 49 | final_data = detailed_data["loans"][0] 50 | 51 | r = requests.get(base_url + l_id + "/teams" + response_format) 52 | time.sleep(1.1) 53 | response_text = r.text 54 | team_data = json.loads(response_text) 55 | final_data["team_count"] = len(team_data["teams"]) 56 | 57 | all_loans.append(final_data) 58 | 59 | else: 60 | break 61 | 62 | json.dump(all_loans, open("kiva_data.json", "w")) 63 | 
-------------------------------------------------------------------------------- /BGG-BS.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import urllib.parse 3 | from bs4 import BeautifulSoup 4 | import requests 5 | import json 6 | import time 7 | 8 | df = pd.read_csv('game_data.csv') 9 | game_names = set([x.replace(' Rules', '') for x in df['Title']]) 10 | print(len(game_names)) 11 | 12 | all_dicts = [] 13 | for g in game_names: 14 | game = {'Title': g} 15 | 16 | enc = urllib.parse.quote_plus(g) 17 | search_url = 'https://boardgamegeek.com/geeksearch.php?action=search&objecttype=boardgame&q={}&B1=Go'.format( 18 | enc) 19 | 20 | print(search_url) 21 | 22 | res = requests.get(search_url).text 23 | soup = BeautifulSoup(res, 'html5lib') 24 | 25 | first_result = soup.find('tr', {'id': 'row_'}) 26 | 27 | try: 28 | metadata = [ 29 | x.text.strip().replace( 30 | '\n', 31 | ' ').replace( 32 | '\t', 33 | '').replace( 34 | ' ', 35 | ' ') for x in first_result.find_all('td')] 36 | game['rank'], game['name'], game['geek_rating'], game[ 37 | 'avg_rating'], game['voters'] = [metadata[0]] + metadata[2:-1] 38 | sub_url = 'https://boardgamegeek.com' + \ 39 | first_result.find_all('td')[2].find('a')['href'] 40 | 41 | for l in requests.get(sub_url).text.split('\n'): 42 | if l.strip().startswith('GEEK.geekitemPreload'): 43 | data = json.loads(l.strip()[23:-1]) 44 | game = {**game, **data['item']['stats']} 45 | 46 | all_dicts.append(game) 47 | json.dump(all_dicts, open('all_dicts.json', 'w')) 48 | time.sleep(1) 49 | 50 | except: 51 | all_dicts.append(game) 52 | json.dump(all_dicts, open('all_dicts.json', 'w')) 53 | time.sleep(1) 54 | 55 | df2 = pd.DataFrame(all_dicts) 56 | 57 | match = [] 58 | for t in df2['Title']: 59 | for o in df['Title']: 60 | if o.startswith(t): 61 | match.append(o) 62 | break 63 | 64 | df2['Title'] = match 65 | df.merge(df2, on=('Title')).to_csv('game_data_with_bgg.csv', index=False) 66 | -------------------------------------------------------------------------------- /BoardGameCapital-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import time as time_lib 11 | 12 | driver = webdriver.Chrome() 13 | next_page = "http://www.boardgamecapital.com/board-game-rules.htm" 14 | driver.get(next_page) 15 | 16 | soup = BeautifulSoup(driver.page_source, 'html5lib') 17 | game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1] 18 | 19 | game_dict = {} 20 | 21 | for g in game_cells: 22 | game_dict[g.text] = {} 23 | game_dict[g.text]['link'] = 'http://www.boardgamecapital.com/' + \ 24 | g.find('a')['href'] 25 | 26 | for k in game_dict.keys(): 27 | print(k) 28 | driver.get(game_dict[k]['link']) 29 | 30 | soup = BeautifulSoup(driver.page_source, 'html5lib') 31 | 32 | gstats1 = [x.split(':') for x in soup.find( 33 | 'div', {'class': 'gstats1'}).text.split('\n')] 34 | price = gstats1[0][1].strip()[1:] 35 | time = gstats1[1][1].strip() 36 | 37 | gstats2 = [x.split(':') for x in soup.find( 38 | 'div', {'class': 'gstats2'}).text.split('\n')] 39 | age = gstats2[0][1].strip() 40 | players = gstats2[1][1].strip() 41 | 42 | text = soup.find('div', {'class', 
'mainbody'}).text 43 | 44 | pdf_links = [ 45 | a for a in soup.find( 46 | 'div', { 47 | 'class', 'mainbody'}).find_all('a') if 'Game Rules' in a.text] 48 | 49 | paths = [] 50 | for url in pdf_links: 51 | path = 'pdfs/{}.pdf'.format(url.text) 52 | with open(path, 'wb') as f: 53 | f.write(requests.get(url['href']).content) 54 | 55 | paths.append(path) 56 | 57 | paths = ';'.join(paths) 58 | 59 | game_dict[k]['price'] = price 60 | game_dict[k]['time'] = time 61 | game_dict[k]['age'] = age 62 | game_dict[k]['players'] = players 63 | game_dict[k]['paths'] = paths 64 | game_dict[k]['web_text'] = text 65 | 66 | time_lib.sleep(1) 67 | -------------------------------------------------------------------------------- /IMSDB-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup, NavigableString, Tag 3 | import time 4 | import urllib 5 | import pickle 6 | 7 | res = requests.get('http://www.imsdb.com/all%20scripts/').text 8 | 9 | soup = BeautifulSoup(res, 'html5lib') 10 | 11 | movies = soup.find_all('td', {'valign': 'top'})[2].find_all('p') 12 | 13 | base_url = 'http://www.imsdb.com' 14 | movie_urls = [ 15 | base_url + 16 | urllib.parse.quote( 17 | m.find('a')['href']) for m in movies] 18 | 19 | all_meta = [] 20 | # all_meta = pickle.load(open('meta_dicts.pkl', 'rb')) 21 | for i, url in enumerate(movie_urls[:3]): 22 | print(i) 23 | res = requests.get(url).text 24 | soup = BeautifulSoup(res, 'html5lib') 25 | 26 | script_details = soup.find('table', {'class': 'script-details'}) 27 | 28 | title = script_details.find('h1').text.strip() 29 | 30 | split_details = script_details.find_all('td')[2] 31 | 32 | meta_data = {'title': title} 33 | for t in split_details.find_all('b'): 34 | 35 | sibling_data = '' 36 | for s in t.next_siblings: 37 | if isinstance(s, NavigableString): 38 | if len(str(s).strip()) > 1: 39 | sibling_data += str(s).strip() 40 | break 41 | elif isinstance(s, Tag): 42 | try: 43 | if s.name == 'a': 44 | sibling_data += s.text + ';' 45 | except: 46 | pass 47 | 48 | if s.name == 'b': 49 | break 50 | 51 | meta_data[t.text] = sibling_data 52 | 53 | all_meta.append(meta_data) 54 | 55 | if "Read" in script_details.find_all('a')[-1].text: 56 | 57 | script_link = base_url + \ 58 | urllib.parse.quote(script_details.find_all('a')[-1]['href']) 59 | 60 | script_path = "scripts/" + title + '.html' 61 | with open(script_path, 'w') as f: 62 | f.write(requests.get(script_link).text) 63 | 64 | else: 65 | script_path = "NA" 66 | 67 | meta_data['script_path'] = script_path 68 | 69 | pickle.dump(all_meta, open('meta_dicts.pkl', 'wb')) 70 | 71 | time.sleep(1) 72 | -------------------------------------------------------------------------------- /NHSTrustsInfo-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import csv 5 | 6 | 7 | trust_url = 'https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx' 8 | res = requests.get(trust_url) 9 | soup = BeautifulSoup(res.text, 'lxml') 10 | 11 | all_trusts = [x for x in soup('a') if x['href'].startswith('/Services/Trusts/Overview/DefaultView.aspx?id=')] 12 | 13 | all_items = [] 14 | for t in all_trusts: 15 | trust_name = t.text 16 | print(trust_name) 17 | trust_site = 'https://www.nhs.uk' + t['href'].replace('Overview', 'HospitalsAndClinics') 18 | res = requests.get(trust_site) 19 | soup = BeautifulSoup(res.text, 'lxml') 20 | items = [x for x in soup.find_all('div', 
{'class': 'panel-content'}) if 'Address' in str(x)] 21 | for i in items: 22 | item_name = i.find('h3') 23 | if item_name: 24 | item_name = item_name.text 25 | else: 26 | continue 27 | 28 | if not i.find('a'): 29 | continue 30 | 31 | if i.find('a')['href'].startswith('/Services'): 32 | url = 'https://www.nhs.uk' + i.find('a')['href'] 33 | service_type = i.find('a')['href'].split('/')[2].title() 34 | else: 35 | url = i.find('a')['href'] 36 | service_type = 'Other' 37 | 38 | properties = [x.text for x in i.find('dl').find_all('dt')] 39 | values = [BeautifulSoup(str(x).replace('<br/>
', ', '), 'lxml').text for x in i.find('dl').find_all('dd')] 40 | 41 | info_dict = {'Name': item_name, 42 | 'URL': url, 43 | 'Type': service_type, 44 | 'Trust Name': trust_name} 45 | for i,k in enumerate(properties): 46 | if k in ['PostCode', 'Ext', 'Website']: 47 | continue 48 | info_dict[k.strip(':')] = values[i] 49 | 50 | all_items.append(info_dict) 51 | 52 | time.sleep(2) 53 | 54 | 55 | keys = ['Name', 'Trust Name', 'Type', 'Tel', 'Address', 'Email', 'URL'] 56 | with open('nhs_sites.csv', 'w', newline='') as output_file: 57 | dict_writer = csv.DictWriter(output_file, keys) 58 | dict_writer.writeheader() 59 | dict_writer.writerows(all_items) 60 | -------------------------------------------------------------------------------- /INOCAR-AJAX.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import re 5 | import csv 6 | import time 7 | from random import randint 8 | import pickle 9 | import os.path 10 | 11 | id_dict = {"61": "San Lorenzo", "377": "Esmeraldes"} 12 | days = [str(x) for x in list(range(1, 32))] 13 | months = [str(x) for x in list(range(1, 13))] 14 | years = [str(x) for x in list(range(2003, 2016))] 15 | 16 | if os.path.isfile("already_scraped.pkl"): 17 | already_scraped = pickle.load(open("already_scraped.pkl", "rb")) 18 | else: 19 | already_scraped = [] 20 | 21 | for l in id_dict.keys(): 22 | for y in years: 23 | for m in months: 24 | for d in days: 25 | date = d + "/" + m + "/" + y 26 | if (l, date) not in already_scraped: 27 | payload = { 28 | "id_puerto": l, 29 | "dias": d, 30 | "mes": m, 31 | "anio": y, 32 | "task": "generate", 33 | "tipocon": "form_", 34 | "Submit": "Ver", 35 | } 36 | 37 | r = requests.post( 38 | url='http://www.inocar.mil.ec/mareas/consulta.php', 39 | data=payload 40 | ) 41 | 42 | soup = BeautifulSoup(r.text, "lxml") 43 | 44 | r1 = soup.findAll("tr", {"class": "row_1"})[2:4] 45 | r2 = soup.findAll("tr", {"class": "row_2"})[2:4] 46 | rows = [tuple(r1[0].get_text().split('\n')), 47 | tuple(r2[0].get_text().split('\n')), 48 | tuple(r1[1].get_text().split('\n')), 49 | tuple(r2[1].get_text().split('\n'))] 50 | 51 | with open('data.csv', 'a') as f: 52 | a = csv.writer(f) 53 | for r in rows: 54 | row = (id_dict[l], date) + r 55 | a.writerow(row) 56 | 57 | already_scraped.append((l, date)) 58 | pickle.dump( 59 | already_scraped, open( 60 | "already_scraped.pkl", "wb")) 61 | time.sleep(randint(1, 3)) 62 | -------------------------------------------------------------------------------- /GoogleGeoLatLong-API.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.request import Request, urlopen 3 | import time 4 | import csv 5 | 6 | 7 | def getJson(lat, longi): 8 | url = 'http://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s&sensor=true' % \ 9 | (lat, longi) 10 | 11 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 12 | 13 | response = urlopen(req).read().decode('utf-8') 14 | responseJson = json.loads(response)['results'] 15 | 16 | return responseJson 17 | 18 | latlong = [(18.6, - 19 | 100.566667), (19.6, - 20 | 100.566667), (19.6, - 21 | 101.566667), (17.6, - 22 | 100.566667), (27.121381, - 23 | 107.200644), (37.586630, - 24 | 123.233372), (25.267348, - 25 | 120.087235), (19.6, - 26 | 96.566667), (17.6, - 27 | 98.566667), (37.882042, - 28 | 122.277562)] 29 | 30 | municps = [] 31 | for coord in latlong: 32 | switch = 0 33 | info = getJson(coord[0], coord[1]) 34 | # 
municps.append(info.get("results")[1].get("address_components")[0].get("long_name")) 35 | # #if certain data is there 36 | for result in info: # to avoid errors if incorrect data 37 | for address_component in result['address_components']: 38 | if address_component['types'] == [ 39 | "administrative_area_level_2", "political"]: 40 | municps.append(address_component['long_name']) 41 | switch = 1 42 | break 43 | break 44 | 45 | if switch == 1: 46 | continue 47 | else: 48 | municps.append("None") 49 | 50 | time.sleep(.11) 51 | 52 | 53 | latlongname = list(zip(latlong, municps)) 54 | 55 | with open('data.csv', 'w') as out: 56 | csv_out = csv.writer(out) 57 | csv_out.writerow(['lat-long', 'name']) 58 | for row in latlongname: 59 | csv_out.writerow(row) 60 | -------------------------------------------------------------------------------- /LARRP-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup # to parse HTML 2 | import csv # to write CSV 3 | import pandas as pd # to see CSV 4 | import time 5 | import os 6 | import random 7 | import requests 8 | 9 | 10 | def dl_pages(base_url, books, pres): 11 | os.mkdir(pres) 12 | for i1, b in enumerate(books): 13 | next_page = b 14 | res = requests.get(next_page).text 15 | soup = BeautifulSoup(res, 'html5lib') 16 | book_title = soup.find('h3').text 17 | 18 | os.mkdir(pres + '/' + book_title + '-' + str(i1)) 19 | 20 | try: 21 | for i in range(1, 10000): 22 | res = requests.get(next_page).text 23 | 24 | soup = BeautifulSoup(res, 'html5lib') 25 | 26 | if 'Discurso al proclamarse su candidatura' in book_title: 27 | next_page = base_url + \ 28 | soup.find('center').find_all('a')[1]['href'] 29 | else: 30 | next_page = base_url + \ 31 | soup.find('center').find_all('a')[2]['href'] 32 | 33 | tif_link = base_url + \ 34 | [x['href'] for x in soup.find_all('a') if 'tif' in x['href']][0] 35 | 36 | res = requests.get(tif_link).content 37 | 38 | with open(pres + '/' + book_title + '-' + str(i1) + '/page-' + str(i) + '.tif', 'wb') as f: 39 | f.write(res) 40 | 41 | time.sleep(1) 42 | except: 43 | continue 44 | 45 | 46 | books = [ 47 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/180002t.html', 48 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/190117t.html', 49 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/200253t.html', 50 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/210286t.html', 51 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/170347.html'] 52 | 53 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/' 54 | 55 | dl_pages(base_url, books, 'yrigoyen') 56 | 57 | 58 | res = requests.get( 59 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/index.html').text 60 | 61 | soup = BeautifulSoup(res, 'html5lib') 62 | 63 | books = [] 64 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/' 65 | for li in soup.find('ul').find_all('li'): 66 | link = [x for x in li.find_all('a') if 'idx' not in x['href']][0] 67 | 68 | if not link.text.strip().startswith('I'): 69 | books.append(base_url + link['href']) 70 | 71 | 72 | dl_pages(base_url, books, 'peron') 73 | -------------------------------------------------------------------------------- /Wiktionary-API.py: -------------------------------------------------------------------------------- 1 | '''This script scrapes wiktionary to get MHG lemmas of NHG lemmas.''' 2 | 3 | from bs4 import BeautifulSoup 4 | from urllib.request import Request, urlopen 5 | 
import time 6 | from string import punctuation 7 | import urllib.parse 8 | import treetaggerwrapper 9 | import json 10 | import time 11 | from random import randint 12 | import os 13 | import pyprind 14 | 15 | 16 | # get words from freq list and translations 17 | with open("top10000.txt", "r") as f: 18 | words = f.read().split() 19 | 20 | with open("NHG.txt", "r") as f: 21 | more_words = f.read().split() 22 | 23 | all_words = set(words + more_words) 24 | 25 | # turn words to set of lemmas 26 | tagger = treetaggerwrapper.TreeTagger(TAGLANG='de') 27 | 28 | lemmas = [] 29 | for w in all_words: 30 | lemm = tagger.tag_text(w)[0].split("\t")[-1] 31 | lemmas.append(lemm) 32 | 33 | lemmas = set(lemmas) 34 | 35 | # start scraping here 36 | base = "https://de.wiktionary.org/w/api.php?format=xml&action=query&titles=" 37 | branch = "&rvprop=content&prop=revisions&redirects=1" 38 | 39 | if os.path.isfile("cognate_dict.json"): 40 | cognate_dict = json.load(open("cognate_dict.json", "r")) 41 | else: 42 | cognate_dict = {} 43 | 44 | bar = pyprind.ProgBar(len(lemmas), monitor=True, bar_char="#") 45 | for w in lemmas: 46 | 47 | if w not in cognate_dict: 48 | 49 | # for UTF-8 URL parsing 50 | url = base + w + branch 51 | url_word = urllib.parse.quote(w) 52 | url = base + url_word + branch 53 | 54 | html = urlopen(url) 55 | bsObj = BeautifulSoup(html.read(), "lxml") 56 | text = bsObj.get_text() 57 | 58 | if "mittelhochdeutsch" in text: 59 | ind = text.index("mittelhochdeutsch") 60 | cognates = text[ind:].split("''") 61 | 62 | if len(cognates) > 1: 63 | cognates = cognates[1].split() 64 | for i, c in enumerate(cognates): 65 | if "|" in c: 66 | cognates[i] = c.split("|")[-1] 67 | 68 | for char in punctuation: 69 | cognates = [c.replace(char, "") for c in cognates] 70 | 71 | cognates = [c for c in cognates if len(c) > 0 and c[ 72 | 0].isalpha()] 73 | 74 | cognate_dict[w] = cognates 75 | 76 | with open("cognate_dict.json", "w") as f: 77 | json.dump(cognate_dict, f) 78 | 79 | time.sleep(randint(1, 3)) 80 | 81 | else: 82 | cognate_dict[w] = None 83 | 84 | with open("cognate_dict.json", "w") as f: 85 | json.dump(cognate_dict, f) 86 | 87 | else: 88 | 89 | cognate_dict[w] = None 90 | 91 | with open("cognate_dict.json", "w") as f: 92 | json.dump(cognate_dict, f) 93 | 94 | bar.update() 95 | 96 | print("Done!") 97 | -------------------------------------------------------------------------------- /ADA-ERP-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | import pickle 6 | import csv 7 | 8 | 9 | def get_pages(soup): 10 | ''' 11 | gets links to any subsequent pages 12 | ''' 13 | base = 'https://professional.diabetes.org' 14 | try: 15 | page_links = soup.find('ul', {'class': 'pagination'}).find_all('a') 16 | links = [base + a['href'] for a in page_links] 17 | return set(links) 18 | except: 19 | return None 20 | 21 | 22 | def get_org_dicts(soup): 23 | ''' 24 | turn any listed organizations on page to dictionaries 25 | ''' 26 | 27 | orgs = soup.find_all('div', {'class': 'col col-sm-4'}) 28 | 29 | org_dicts = [] 30 | 31 | for o in orgs: 32 | meta = o.find_all('div') 33 | org_dict = {} 34 | 35 | # up to colon is key after is value 36 | pattern = re.compile('(.*?):(.*)') 37 | for m in meta: 38 | try: 39 | groups = re.search(pattern, m.text).groups() 40 | title = groups[0].strip() 41 | value = groups[1].strip() 42 | org_dict[title] = value 43 | except: 44 | pass 45 | 46 | org_dicts.append(org_dict) 47 | 48 
| return org_dicts 49 | 50 | 51 | if __name__ == "__main__": 52 | # get list of states from sample URL 53 | init = 'https://professional.diabetes.org/erp_list?field_erp_state_value=NY' 54 | res = requests.get(init) 55 | soup = BeautifulSoup(res.text, 'html5lib') 56 | options = soup.find( 57 | 'select', {'id': 'edit-field-erp-state-value'}).find_all('option') 58 | states = [x['value'] for x in options] 59 | 60 | # start iteration through state URLS 61 | all_dicts = [] 62 | for s in states: 63 | print(s) 64 | state_link = 'https://professional.diabetes.org/erp_list?field_erp_state_value={}'.format( 65 | s) 66 | res = requests.get(state_link) 67 | soup = BeautifulSoup(res.text, 'html5lib') 68 | 69 | # get dicts 70 | all_dicts.extend(get_org_dicts(soup)) 71 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb')) 72 | 73 | # get extra pages 74 | pages = get_pages(soup) 75 | 76 | # cycle through subsequent pages 77 | if pages != None: 78 | for p in pages: 79 | res = requests.get(p) 80 | soup = BeautifulSoup(res.text, 'html5lib') 81 | all_dicts.extend(get_org_dicts(soup)) 82 | time.sleep(1) 83 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb')) 84 | time.sleep(1) 85 | 86 | # dump csv 87 | with open('erp.csv', 'w') as csvfile: 88 | fieldnames = list(all_dicts[0].keys()) 89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 90 | writer.writeheader() 91 | writer.writerows(all_dicts) 92 | -------------------------------------------------------------------------------- /RioGrandeGames-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import pickle 11 | 12 | 13 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true']) 14 | driver.get('http://riograndegames.com/search.html?category%5B%5D=5&category%5B%5D=10&category%5B%5D=14&category%5B%5D=1&category%5B%5D=2&category%5B%5D=12&category%5B%5D=3&category%5B%5D=6&category%5B%5D=8&category%5B%5D=9&category%5B%5D=4&category%5B%5D=13&category%5B%5D=22&category%5B%5D=16&category%5B%5D=11&category%5B%5D=7&category%5B%5D=17&category%5B%5D=18&category%5B%5D=15&language=0&min_players=0&length=0&min_age=0&term=') 15 | search_results = driver.find_element_by_css_selector( 16 | 'div#search_results.isotope').find_elements_by_css_selector('div.search_item.isotope-item') 17 | 18 | games_dicts = [] 19 | attributes = [ 20 | 'data-title', 21 | 'data-orig', 22 | 'data-length', 23 | 'data-date', 24 | 'data-age', 25 | 'data-players', 26 | 'data-msrp'] 27 | 28 | for s in search_results: 29 | game = {} 30 | for a in attributes: 31 | game[a] = s.get_attribute(a) 32 | 33 | game['page_link'] = s.find_element_by_css_selector( 34 | 'a').get_attribute('href') 35 | 36 | games_dicts.append(game) 37 | 38 | 39 | final_games_dicts = [] 40 | for g in games_dicts: 41 | print(g['data-title']) 42 | driver.get(g['page_link']) 43 | cats = driver.find_elements_by_css_selector('span.game_cat') 44 | cats = [c.text.replace(',', '') for c in cats] 45 | g['game_category'] = ';'.join(cats) 46 | 47 | # unfold and download 48 | driver.find_element_by_css_selector('span.button2').click() 49 | 50 | asset_links = driver.find_elements_by_css_selector('p.asset_list a') 51 | 52 | for a in asset_links: 53 | images = 
a.find_elements_by_css_selector("img") 54 | for i in images: 55 | if "rules" in i.get_attribute('title').lower(): 56 | download = a.get_attribute('href') 57 | session = requests.Session() 58 | cookies = driver.get_cookies() 59 | 60 | for cookie in cookies: 61 | session.cookies.set(cookie['name'], cookie['value']) 62 | response = session.get(download) 63 | 64 | dl_path = 'pdfs/' + g['data-title'] + '.pdf' 65 | 66 | with open(dl_path, 'wb') as f: 67 | f.write(response.content) 68 | 69 | g['pdf_path'] = dl_path 70 | final_games_dicts.append(g) 71 | pickle.dump(final_games_dicts, open('game_dicts.pkl', 'wb')) 72 | 73 | time.sleep(1) 74 | break 75 | break 76 | 77 | time.sleep(1) 78 | -------------------------------------------------------------------------------- /STNMFSNOAA-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import time 5 | 6 | # send payload to get list of species 7 | payload = {'qwhocalled': 'monthly', 8 | 'qcommon': '', 9 | 'qreturn': 'Search', 10 | 'qselect': 'List Empty, Do a Search to Fill'} 11 | r = requests.get( 12 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES', 13 | params=payload) 14 | 15 | soup = BeautifulSoup(r.content, "lxml") 16 | species = [x.text for x in soup.findAll("option")] 17 | 18 | # iterate through species 19 | for sp in species: 20 | 21 | if not os.path.exists(sp.replace(",", "").replace( 22 | " ", "-").replace("/", "_")): # if need to restart script 23 | 24 | # make directory for species 25 | os.mkdir(sp.replace(",", "").replace(" ", "-").replace("/", "_")) 26 | 27 | # send payload to get different states and regions 28 | payload = {'qwhocalled': 'monthly', 29 | 'qcommon': '', 30 | 'qreturn': 'Return', 31 | 'qselect': sp} 32 | r = requests.get( 33 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES', 34 | params=payload) 35 | 36 | soup = BeautifulSoup(r.content, "lxml") 37 | states = [ 38 | x.text for x in soup.find( 39 | "select", { 40 | "name": "qstate"}).findAll("option")] 41 | 42 | # iterate through different regions and states 43 | for st in states: 44 | 45 | payload = {'qspecies': sp, 46 | 'qreturn': 'Species Locator', 47 | 'qyearfrom': '1990', 48 | 'qyearto': '2015', 49 | 'qmonth': 'YEAR BY MONTH', 50 | 'qstate': st, 51 | 'qoutput_type': 'TABLE'} 52 | r = requests.get( 53 | 'http://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS', 54 | params=payload) 55 | 56 | # save html tables into folders 57 | with open(sp.replace(",", "").replace(" ", "-").replace("/", "_") + "/" + st + ".html", "w") as f: 58 | f.write(str(r.content)) 59 | 60 | # don't overload server 61 | time.sleep(.1) 62 | 63 | # get all species from main page 64 | os.mkdir('ALL-SPECIES-COMBINED') 65 | 66 | # iterate through different states and regions 67 | for st in states: 68 | 69 | payload = {'qspecies': 'ALL SPECIES COMBINED', 70 | 'qreturn': 'Species Locator', 71 | 'qyearfrom': '1990', 72 | 'qyearto': '2015', 73 | 'qmonth': 'YEAR BY MONTH', 74 | 'qstate': st, 75 | 'qoutput_type': 'TABLE'} 76 | 77 | r = requests.get( 78 | 'https://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS', 79 | params=payload) 80 | 81 | with open('ALL-SPECIES-COMBINED' + "/" + st + ".html", "w") as f: 82 | f.write(str(r.content)) 83 | -------------------------------------------------------------------------------- /BAAD-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup # to parse HTML 2 | 
import csv # to write CSV 3 | import pandas as pd # to see CSV 4 | import time 5 | import os 6 | import random 7 | import requests 8 | 9 | next_page = 'http://www.start.umd.edu/baad/database' 10 | base_url = 'http://www.start.umd.edu' 11 | 12 | all_rows = [] 13 | all_rows.append(['ID', 14 | 'Group Name', 15 | 'Country', 16 | 'Lethality', 17 | 'Number of Allies', 18 | 'Number of Rivals', 19 | 'Founded', 20 | 'Fatalities', 21 | 'Fatality Years', 22 | 'Ideologies', 23 | 'Strength', 24 | 'Territorial Control', 25 | 'Funding through Drug Trafficking', 26 | 'Record Year']) 27 | 28 | for i in range(1, 6): 29 | res = requests.get(next_page).text 30 | 31 | soup = BeautifulSoup(res, 'html5lib') 32 | 33 | rows = soup.find('table', {'class', 'sticky-enabled'}).find_all('tr') 34 | rows = rows[1:] 35 | 36 | for r in rows: 37 | cells = r.find_all('td') 38 | cell_text = [x.text.strip() for x in cells] 39 | link = base_url + cells[0].find('a')['href'] 40 | 41 | res = requests.get(link).text 42 | soup = BeautifulSoup(res, 'html5lib') 43 | 44 | year_bullets = soup.find('div', {'class': 'item-list'}).find_all('li') 45 | year_urls = [(base_url + x.find('a')['href'], 46 | x.find('a').text.strip()) for x in year_bullets] 47 | for u in year_urls: 48 | record_year = u[1] 49 | res = requests.get(u[0]).text 50 | soup = BeautifulSoup(res, 'html5lib') 51 | 52 | founded = soup.find( 53 | 'div', {'class', 'quick-view-founded'}).text.split(':')[-1].strip() 54 | fatalities, fatality_years = soup.find( 55 | 'div', {'class', 'quick-view-lethality'}).text.split(':')[-1].strip().split(' ', maxsplit=1) 56 | ideology = soup.find( 57 | 'div', {'class', 'quick-view-ideology'}).text.split(':')[-1].strip() 58 | strength = soup.find( 59 | 'div', {'class', 'quick-view-strength'}).text.split(':')[-1].strip() 60 | terrcnt = soup.find( 61 | 'div', {'class', 'quick-view-terrcnt'}).text.split(':')[-1].strip() 62 | drugs = soup.find( 63 | 'div', {'class', 'quick-view-drug-funding'}).text.split(':')[-1].strip() 64 | 65 | data_row = [ 66 | cell_text[0] + '-' + record_year] + cell_text + [ 67 | founded, 68 | fatalities, 69 | fatality_years, 70 | ideology, 71 | strength, 72 | terrcnt, 73 | drugs, 74 | record_year] 75 | print(data_row) 76 | all_rows.append(data_row) 77 | 78 | time.sleep(1) 79 | 80 | time.sleep(1) 81 | 82 | next_page = 'http://www.start.umd.edu/baad/database?page={}'.format(str(i)) 83 | time.sleep(1) 84 | 85 | 86 | with open("baad.csv", "w") as f: 87 | csv_w = csv.writer(f) 88 | csv_w.writerows(all_rows) 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-scrapers 2 | 3 | Various web scrapers for research and fun: 4 | 5 | - [Board Game Capital](http://www.boardgamecapital.com/board-game-rules.htm) 6 | - [CTS Net](https://www.ctsnet.org/) 7 | - [Minutes of the Federal Reserve Board of Governors discount rate](https://www.federalreserve.gov/monetarypolicy/discountrate.htm) 8 | - [Doximity](https://www.doximity.com/) 9 | - [Energy - The Automated Register of Implemented Actions](https://www.energy.gov/eere/downloads/automated-register-implemented-actions) 10 | - [Lucid Chart](https://www.lucidchart.com/) 11 | - [GLERL NOAA](https://www.glerl.noaa.gov//metdata/status/status_archive/) 12 | - [American Historical Association](http://careers.historians.org/jobs/) 13 | - [Grimm Fairy Tales](https://www.cs.cmu.edu/~spok/grimmtmp/) 14 | - [Perrault Fairy Tales](http://www.pitt.edu/~dash/perrault.html) 15 
| - [IMSDB](http://www.imsdb.com/all%20scripts/) 16 | - [Glass Door API](https://www.glassdoor.com/index.htm) 17 | - [Crunch Base API](https://data.crunchbase.com/docs) 18 | - [Google Directions](https://www.google.com/maps/dir/) 19 | - [INOCAR](http://www.inocar.mil.ec/web/index.php) 20 | - [Kalamazoo](http://scholarworks.wmich.edu/) 21 | - [Kiva API](https://www.kiva.org/) 22 | - [Google Geocoding API](https://developers.google.com/maps/documentation/geocoding/start) 23 | - [Google Search](https://www.google.com/) 24 | - [GLO Records](https://glorecords.blm.gov/default.aspx) 25 | - [MRIP NOAA](http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project) 26 | - [Web of Science](http://ipscience-help.thomsonreuters.com/LAMRService/WebServiceOperationsGroup/requestAPIWoS.html) 27 | - [ST NMFS NOAA](https://www.st.nmfs.noaa.gov/) 28 | - [NCDC NOAA](https://www.ncdc.noaa.gov/cdr/atmospheric/total-solar-irradiance) 29 | - [Open Secrets](https://www.opensecrets.org/resources/create/apis.php) 30 | - [Resident Advisor](https://www.residentadvisor.net/reviews.aspx?format=single) 31 | - [Rate My Professors](http://www.ratemyprofessors.com/) 32 | - [LARRP](http://lanic.utexas.edu/larrp/pm/sample2/) 33 | - [Wiktionary](https://de.wiktionary.org/) 34 | - [Wikipedia Revision History API](https://www.mediawiki.org/wiki/API:Revisions) 35 | - [Big, Allied and Dangerous BAAD](http://www.start.umd.edu/baad/database) 36 | - [Rio Grande Games](http://riograndegames.com/) 37 | - [Bayerische Staatsbibliothek](https://opacplus.bsb-muenchen.de/) 38 | - [DataSF Fire Incidents API](https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric) 39 | - [Google Geocoding API Searches](https://developers.google.com/maps/documentation/geocoding/start) 40 | - [Board Game Geek](https://boardgamegeek.com/) 41 | - [Data Mart Basic Skills](http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx) 42 | - [Public Access to Court Electronic Records (PACER)](https://www.pacer.gov/) 43 | - [SF Planning Commission Minutes](http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html) 44 | - [American Diabetes Association ERP Resources](https://professional.diabetes.org/erp_list?field_erp_state_value=NY) 45 | - [National Stolen Art File](https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file) 46 | - [NHS Trusts](https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx) 47 | -------------------------------------------------------------------------------- /BSBDigitaleSammlungen-API.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import re 11 | import pickle 12 | import numpy as np 13 | 14 | 15 | # PART 1 16 | # first collect bsb ids from search of years 700-1400 17 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true']) 18 | 19 | driver.maximize_window() 20 | 21 | driver.get("https://opacplus.bsb-muenchen.de/metaopac/start.do") 22 | driver.find_element_by_css_selector( 23 | 'input#searchRestrictionValue1_2.form-control').send_keys('700') 24 | driver.find_element_by_css_selector( 25 | 'input#searchRestrictionValue2_2.form-control').send_keys('1400') 26 | 
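# the two inputs above restrict the search to the years 700-1400; the next two clicks submit the search and pick a facet from the results sidebar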
driver.find_element_by_css_selector( 27 | 'input#submitSearch.btn.btn-default.dbuttonb').click() 28 | driver.find_element_by_css_selector( 29 | '#availableFacets > ul > li:nth-child(4) > ul > li:nth-child(5) > a > span.hidden-xs').click() 30 | 31 | time.sleep(5) 32 | 33 | print(driver.find_element_by_css_selector( 34 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.col-xs-9.col-md-5 > h2').text) 35 | 36 | bsbs = [] 37 | pattern = r'bsb[0-9]+' 38 | 39 | for i in range(2000): 40 | 41 | print(i) 42 | 43 | soup = BeautifulSoup(driver.page_source, 'html5lib') 44 | 45 | rows = soup.find_all('td', {'class': 'resultscell'}) 46 | 47 | for r in rows: 48 | links = r.find_all('a') 49 | for l in links: 50 | if re.search(pattern, l['href']): 51 | bsbs.append(re.search(pattern, l['href']).group()) 52 | 53 | pickle.dump(bsbs, open('bsbs.pkl', 'wb')) 54 | 55 | driver.find_element_by_css_selector( 56 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.hidden-xs.hidden-sm.col-xs-7.col-md-7.pull-right.pagination > div > ul > li:nth-child(8) > a').click() 57 | time.sleep(5) 58 | 59 | 60 | # PART 2 61 | # now read in list of bsb ids and collect API data 62 | 63 | 64 | def get_dimensions(res): 65 | 66 | width = [] 67 | height = [] 68 | for p in res['sequences'][0]['canvases']: 69 | try: 70 | scale = p['service']['physicalScale'] 71 | width.append(p['width'] * scale) 72 | height.append(p['height'] * scale) 73 | except: 74 | pass 75 | 76 | return (np.mean(height), np.mean(width)) 77 | 78 | bsbs = pickle.load(open('bsbs.pkl', 'rb')) 79 | data_dicts = [] 80 | 81 | for bsb in bsbs: 82 | print(bsb) 83 | 84 | try: 85 | res = requests.get( 86 | 'https://api.digitale-sammlungen.de/iiif/presentation/v2/{}/manifest'.format(bsb)).json() 87 | hs_dict = {} 88 | hs_dict['Thumbnail'] = res['thumbnail']['@id'] 89 | hs_dict['Label'] = res['label'] 90 | 91 | for m in res['metadata']: 92 | key = m['label'][1]['@value'] 93 | value = m['value'] 94 | 95 | if isinstance(value, list): 96 | value = value[-1]['@value'] 97 | 98 | hs_dict[key.strip()] = value.strip() 99 | 100 | hs_dict['Height'], hs_dict['Width'] = get_dimensions(res) 101 | 102 | data_dicts.append(hs_dict) 103 | pickle.dump(data_dicts, open('data_dicts.pkl', 'wb')) 104 | 105 | except: 106 | pass 107 | 108 | time.sleep(3) 109 | -------------------------------------------------------------------------------- /AHA-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | 11 | driver = webdriver.Chrome() 12 | driver.get("http://careers.historians.org/jobs/?page=1") 13 | 14 | base_url = 'http://careers.historians.org' 15 | all_rows = [] 16 | pages = ["http://careers.historians.org/jobs/?page=1", 17 | "http://careers.historians.org/jobs/?page=2"] 18 | 19 | for p in pages: 20 | driver.get(p) 21 | soup = BeautifulSoup(driver.page_source, 'html5lib') 22 | 23 | rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'}) 24 | for r in rows: 25 | title = r.find('a').text.strip() 26 | link = base_url + r.find('a')['href'] 27 | employer = r.find( 28 | 'div', { 29 | 'class': 'bti-ui-job-result-detail-employer'}).text.strip() 30 | location = r.find( 31 | 'div', 
{ 32 | 'class': 'bti-ui-job-result-detail-location'}).text.strip() 33 | date_posted = r.find( 34 | 'div', { 35 | 'class': 'bti-ui-job-result-detail-age'}).text.strip() 36 | 37 | driver.get(link) 38 | 39 | soup = BeautifulSoup(driver.page_source, 'html5lib') 40 | 41 | try: 42 | job_description = soup.find( 43 | 'div', {'class': 'bti-jd-description'}).text.strip() 44 | 45 | details = soup.find('div', {'class': 'bti-jd-details-container'}) 46 | 47 | details_titles = [ 48 | x.text.replace( 49 | ':', '').lower().strip() for x in details.find_all( 50 | 'div', { 51 | 'class': 'bti-jd-detail-title'})] 52 | details_text = [ 53 | x.text.strip() for x in details.find_all( 54 | 'div', { 55 | 'class': 'bti-jd-detail-text'})] 56 | 57 | details_dict = {} 58 | 59 | for i in range(len(details_titles)): 60 | t = details_titles[i] 61 | if 'categories' in t: 62 | t = 'category' 63 | elif 'required' in t: 64 | t = 'preferred education' 65 | details_dict[t] = details_text[i] 66 | 67 | details_dict['title'] = title 68 | details_dict['link'] = link 69 | details_dict['employer'] = employer 70 | details_dict['location'] = location 71 | details_dict['date_posted'] = date_posted 72 | details_dict['job_description'] = job_description 73 | 74 | try: 75 | details_dict['employer_about'] = soup.find( 76 | 'div', {'class': 'bti-jd-employer-info'}).text.strip() 77 | except: 78 | details_dict['employer_about'] = '' 79 | 80 | all_rows.append(details_dict) 81 | 82 | except: 83 | pass 84 | 85 | time.sleep(1) 86 | 87 | header = ["title", 88 | "employer", 89 | "location", 90 | "posted", 91 | "date_posted", 92 | "primary field", 93 | "category", 94 | "preferred education", 95 | "salary", 96 | "type", 97 | "employment type", 98 | "job_description", 99 | "employer_about", 100 | "link" 101 | ] 102 | 103 | 104 | with open('AHA-data.csv', 'w') as f: 105 | w = csv.DictWriter(f, header) 106 | w.writeheader() 107 | w.writerows(all_rows) 108 | -------------------------------------------------------------------------------- /ResidentAdvisor-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | 10 | driver = webdriver.PhantomJS() 11 | next_page = "https://www.residentadvisor.net/reviews.aspx?format=single" 12 | 13 | with open("resident-adv.csv", "a") as f: 14 | csv_w_interv = csv.writer(f) 15 | csv_w_interv.writerow(["title", 16 | "artist", 17 | "single", 18 | "label", 19 | "record", 20 | "style", 21 | "reviewed_date", 22 | "release_date", 23 | "comments", 24 | "rating", 25 | "description", 26 | "URL"]) 27 | 28 | 29 | for i in range(10000): 30 | 31 | driver.get(next_page) 32 | 33 | soup = BeautifulSoup(driver.page_source, "html5lib") 34 | 35 | try: 36 | next_page = "https://www.residentadvisor.net/" + \ 37 | soup.find("li", {"class": "but arrow-left bbox"}).find("a")['href'] 38 | except: 39 | next_page = "" 40 | 41 | singles = soup.find( 42 | "div", { 43 | "id": "reviews"}).find_all( 44 | "article", { 45 | "class": "highlight-top"}) 46 | 47 | review_links = [ 48 | 'https://www.residentadvisor.net' + 49 | x.find("a")['href'] for x in singles] 50 | 51 | if i == 0: 52 | review_links = review_links[25:] 53 | 54 | for l in review_links: 55 | driver.get(l) 56 | 57 | soup = 
BeautifulSoup(driver.page_source, 'html5lib') 58 | 59 | title = soup.find("div", {"id": "sectionHead"}).find("h1").text.strip() 60 | 61 | try: 62 | artist = title.split("-")[0].strip() 63 | 64 | single = title.split("-")[1].strip() 65 | except: 66 | artist = '' 67 | single = '' 68 | 69 | print(title) 70 | 71 | rating = soup.find("span", {"class": "rating"}).text.split("/")[0] 72 | reviewed_date = soup.find("span", {"itemprop": "dtreviewed"})[ 73 | 'datetime'].strip() 74 | 75 | meta_list = soup.find("ul", {"class": "clearfix"}).find_all("li") 76 | 77 | style = meta_list[2].text.split('\n')[4] 78 | label = str(meta_list[0]).split( 79 | '
')[0].split('">')[-1].split('')[-1].split(" 0: 108 | 109 | with open("IT-cardi.csv", "a") as f: 110 | csv_w_interv = csv.writer(f) 111 | csv_w_interv.writerow( 112 | [name, hospital, phone, interests, fields, city, country, street, l]) 113 | 114 | time.sleep(random.randint(1, 3)) 115 | time.sleep(random.randint(1, 3)) 116 | -------------------------------------------------------------------------------- /Doximity-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | 10 | 11 | header = [ 12 | 'Name', 13 | 'Title', 14 | 'Hospital', 15 | 'Phone', 16 | 'State', 17 | 'Tags', 18 | 'Summary', 19 | 'Skills', 20 | 'City', 21 | 'Address'] 22 | 23 | with open("cardi.csv", "a") as f: 24 | csv_w_electro = csv.writer(f) 25 | csv_w_electro.writerow(header) 26 | 27 | driver = webdriver.PhantomJS() 28 | next_page = "https://www.doximity.com/directory/md/specialty/thoracic-surgery?from_slug=pub%2Fmichael-peter-kaye-md" 29 | 30 | for i in range(1000): 31 | 32 | driver.get(next_page) 33 | 34 | try: 35 | next_page = BeautifulSoup( 36 | driver.page_source, "html5lib").find( 37 | "a", { 38 | "class": "next_page"})['href'] 39 | next_page = "https://www.doximity.com" + next_page 40 | except: 41 | next_page = "" 42 | 43 | links = [a.get_attribute( 44 | 'href') for a in driver.find_elements_by_css_selector("ul.list-4-col a")] 45 | links = random.sample(links, 15) 46 | 47 | for l in links: 48 | 49 | driver.get(l) 50 | soup = BeautifulSoup(driver.page_source, "html5lib") 51 | 52 | try: 53 | name = soup.find("span", {"id": "user_full_name"}).text.strip() 54 | print(name) 55 | except: 56 | name = "" 57 | 58 | try: 59 | title = soup.find("p", {"itemprop": "jobTitle"}).text.strip() 60 | except: 61 | title = "" 62 | 63 | try: 64 | city = soup.find( 65 | "span", { 66 | "itemprop": "addressLocality"}).text.strip() 67 | except: 68 | city = "" 69 | 70 | try: 71 | state = soup.find("span", 72 | {"itemprop": "addressRegion"}).text.strip() 73 | except: 74 | state = "" 75 | 76 | try: 77 | address = soup.find("div", {"class": "col-1-2"}).text.strip() 78 | except: 79 | address = "" 80 | 81 | try: 82 | hospital = soup.find("section", 83 | {"class": "section hospital-info"}).findAll("span", 84 | {"itemprop": "name"}) 85 | hospitals = '; '.join([x.text.strip() for x in hospital]) 86 | except: 87 | hospitals = "" 88 | 89 | try: 90 | phone = soup.find("span", {"itemprop": "telephone"}).text.strip() 91 | except: 92 | phone = "" 93 | 94 | try: 95 | summary = soup.find( 96 | "section", { 97 | "class": "section summary-info"}).find("ul").text.strip() 98 | except: 99 | summary = "" 100 | 101 | try: 102 | skills = soup.find( 103 | "div", { 104 | "class": "section skills-info"}).find("ul").text.strip() 105 | except: 106 | skills = "" 107 | 108 | try: 109 | tags = soup.find("div", {"class": "section"}).find( 110 | "p").text.strip() 111 | 112 | if len(phone) > 0: 113 | if "cardi" in tags.lower(): 114 | with open("cardi.csv", "a") as f: 115 | csv_w_electro = csv.writer(f) 116 | csv_w_electro.writerow( 117 | [name, title, hospitals, phone, state, tags, summary, skills, city, address]) 118 | 119 | except: 120 | pass 121 | 122 | time.sleep(random.randint(1, 3)) 123 | 124 | time.sleep(random.randint(1, 
3)) 125 | -------------------------------------------------------------------------------- /RateMyProfessors-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from selenium.webdriver.common.keys import Keys 4 | from bs4 import BeautifulSoup # to parse HTML 5 | import csv # to write CSV 6 | import pandas as pd # to see CSV 7 | import time 8 | import os 9 | import random 10 | 11 | 12 | header = ['Prof_Name', 13 | 'Title', 14 | 'School', 15 | 'Overall_Quality', 16 | 'Overall_Take_Again', 17 | 'Overall_Difficulty', 18 | 'Overall_Hot', 19 | 'Comment_Date', 20 | 'Rating_Type', 21 | 'Course', 22 | 'Quality', 23 | 'Difficulty', 24 | 'Credit', 25 | 'Attendance', 26 | 'Textbook', 27 | 'Take_Again', 28 | 'Grade', 29 | 'Comment', 30 | 'Helpful', 31 | 'Not_Helpful', 32 | 'URL'] 33 | 34 | with open("rmp.csv", "a") as f: 35 | csv_w = csv.writer(f) 36 | csv_w.writerow(header) 37 | 38 | base_url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' 39 | 40 | driver = webdriver.PhantomJS() 41 | driver.get(base_url + str(random.randint(1, 500000))) 42 | driver.find_element_by_css_selector('a.btn.close-this').click() 43 | 44 | for i in range(500000): 45 | url = base_url + str(random.randint(1, 500000)) 46 | driver.get(url) 47 | 48 | try: 49 | soup = BeautifulSoup(driver.page_source, 'html5lib') 50 | comment_table = soup.find('table', {'class': 'tftable'}) 51 | comments = comment_table.find_all('tr')[1:] 52 | except: 53 | continue 54 | 55 | prof_name = ' '.join( 56 | soup.find( 57 | 'h1', { 58 | 'class': 'profname'}).text.strip().split()) 59 | print(prof_name) 60 | school = soup.find('a', {'class': 'school'}).text.strip() 61 | title = ' '.join( 62 | soup.find( 63 | 'div', { 64 | 'class': 'result-title'}).text.strip().split()).split(' are you')[0] 65 | 66 | overall = soup.find_all('div', {'class': 'grade'})[:3] 67 | o_quality, o_take_again, o_difficulty = [x.text.strip() for x in overall] 68 | o_hot = soup.find_all('div', {'class': 'grade'})[3].find('img')[ 69 | 'src'].split('/')[-1].split('.')[0] 70 | 71 | all_rows = [] 72 | for c in comments: 73 | try: 74 | date = c.find('div', {'class': 'date'}).text.strip() 75 | rating_type = c.find('span', {'class': 'rating-type'}).text.strip() 76 | course = c.find('span', {'class': 'name'}).text.strip() 77 | credit = c.find('span', {'class': 'credit'} 78 | ).text.strip().split(':')[1].strip() 79 | attendance = c.find( 80 | 'span', { 81 | 'class': 'attendance'}).text.strip().split(':')[1].strip() 82 | textbook = c.find( 83 | 'span', { 84 | 'class': 'textbook-used'}).text.strip().split(':')[1].strip() 85 | take_again = c.find( 86 | 'span', { 87 | 'class': 'would-take-again'}).text.strip().split(':')[1].strip() 88 | grade = c.find('span', {'class': 'grade'} 89 | ).text.strip().split(':')[1].strip() 90 | 91 | brkdown = c.find( 92 | 'div', { 93 | 'class': 'breakdown'}).find_all( 94 | 'div', { 95 | 'class': 'descriptor-container'}) 96 | quality, difficulty = [x.text.strip().split()[0] for x in brkdown] 97 | 98 | helpful = c.find('a', {'class': 'helpful'}).find( 99 | 'span', {'class': 'count'}).text.strip() 100 | not_helpful = c.find( 101 | 'a', { 102 | 'class': 'nothelpful'}).find( 103 | 'span', { 104 | 'class': 'count'}).text.strip() 105 | 106 | comment = c.find('p', {'class': 'commentsParagraph'}).text 107 | 108 | row = [prof_name, 109 | title, 110 | school, 111 | o_quality, 112 | 
o_take_again, 113 | o_difficulty, 114 | o_hot, 115 | date, 116 | rating_type, 117 | course, 118 | quality, 119 | difficulty, 120 | credit, 121 | attendance, 122 | textbook, 123 | take_again, 124 | grade, 125 | comment, 126 | helpful, 127 | not_helpful, 128 | url] 129 | 130 | all_rows.append(row) 131 | 132 | except: 133 | pass 134 | 135 | with open("rmp.csv", "a") as f: 136 | csv_w = csv.writer(f) 137 | csv_w.writerows(all_rows) 138 | 139 | time.sleep(random.randint(1, 3)) 140 | -------------------------------------------------------------------------------- /DataMartBasicSkills-req.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import json 5 | import pickle 6 | import time 7 | import re 8 | import glob 9 | 10 | 11 | class BasicSkillsCollege: 12 | 13 | def __init__(self, college): 14 | 15 | self.sess = requests.session() 16 | self.url = 'http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx' 17 | self.init_req = self.sess.get(self.url) 18 | self.init_req_soup = BeautifulSoup(self.init_req.content, 'html5lib') 19 | self.init_states = {tag['name']: tag['value'] 20 | for tag in self.init_req_soup.select('input[name^=__]')} 21 | self.college = college 22 | print(self.college) 23 | 24 | def parse_params(self, r): 25 | lst = re.search(r'\[.+\]', r.text).group() 26 | terms = lst.replace( 27 | '"', 28 | '').replace( 29 | '[', 30 | '').replace( 31 | ']', 32 | '').replace( 33 | "'", 34 | "").split(',') 35 | terms = [x.strip() for x in terms] 36 | 37 | tps = [] 38 | for i in range(len(terms)): 39 | if (i + 2) % 2 == 0: 40 | tps.append((terms[i + 1], terms[i])) 41 | 42 | return tps 43 | 44 | def get_s_terms(self): 45 | data = self.init_states 46 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxSTerm' 47 | data['__CALLBACKPARAM'] = 'c0:LECC|0;;LBCRI|4;0:-2;' 48 | data['DXScript'] = '1_243,1_138,1_237,1_164,1_141,1_135,1_226,1_234,1_162,1_170,1_161,1_229,1_159,1_227,1_165,1_143,1_176,1_151,1_232,1_149,7_50,7_53,7_48,7_52,1_235,1_218,1_228,1_210,1_184,1_136' 49 | data['DXCss'] = '0_224,1_28,0_226,0_115,1_10,0_117,0_143,7_2,0_145,../css/styles.css,../css/navigation-mininav.css,../css/design01.css,../css/footer-without-dark-container.css' 50 | data['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0] 51 | data['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1] 52 | data['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1] 53 | 54 | req = self.sess.post(self.url, data=data) 55 | 56 | sterms = self.parse_params(req) 57 | spring_2006 = [x[0] for x in sterms].index('Spring 2006') 58 | sterms = sterms[:spring_2006 + 1][::-1] 59 | 60 | return (data, sterms) 61 | 62 | def get_skills(self): 63 | data, sterms = self.get_s_terms() 64 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxBSSub' 65 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = sterms[0][0] 66 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = sterms[0][1] 67 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = sterms[0][1] 68 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = sterms[0][0] 69 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = sterms[0][1] 70 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = sterms[0][1] 71 | 72 | req = self.sess.post(self.url, data=data) 73 | skills = self.parse_params(req) 74 | 75 | return (data, sterms, skills) 76 | 77 | def get_levels(self): 78 | data, sterms, skills = self.get_skills() 79 | college_params = [] 80 | for i in range(len(sterms)): 81 | params = {} 82 | for 
i2 in range(len(sterms) - i): 83 | for i3 in range(len(skills)): 84 | if "ESL" not in skills[i3][0]: 85 | params['sterm'] = sterms[i] 86 | params['eterm'] = sterms[i2 + i] 87 | params['skill'] = skills[i3] 88 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxPL' 89 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = params[ 90 | 'sterm'][0] 91 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = params[ 92 | 'sterm'][1] 93 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = params[ 94 | 'sterm'][1] 95 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = params[ 96 | 'eterm'][0] 97 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = params[ 98 | 'eterm'][1] 99 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = params[ 100 | 'eterm'][1] 101 | data['ASPxRoundPanel1$ASPxComboBoxBSSub'] = params[ 102 | 'skill'][0] 103 | data['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = params[ 104 | 'skill'][1] 105 | data['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = params[ 106 | 'skill'][1] 107 | 108 | req = self.sess.post(self.url, data=data) 109 | 110 | try: 111 | levels = self.parse_params(req) 112 | 113 | for l in levels: 114 | params['sterm'] = sterms[i] 115 | params['eterm'] = sterms[i2 + i] 116 | params['skill'] = skills[i3] 117 | params['level'] = l 118 | college_params.append(params) 119 | params = {} 120 | 121 | except: 122 | pass 123 | 124 | pickle.dump( 125 | college_params, 126 | open('./pickles/' + self.college[0] + '.pkl', 'wb')) 127 | return college_params 128 | 129 | def dl_csv(self): 130 | config = pickle.load( 131 | open( 132 | './pickles/' + 133 | self.college[0] + 134 | '.pkl', 135 | 'rb')) 136 | num_configs = len(config) 137 | 138 | params_json = json.load(open('pickles/dump.HAR')) 139 | params1 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[ 140 | 'log']['entries'][-6]['request']['postData']['params']} 141 | params2 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[ 142 | 'log']['entries'][-1]['request']['postData']['params']} 143 | 144 | headers = {d['name']: d['value'] for d in params_json[ 145 | 'log']['entries'][-1]['request']['headers']} 146 | del headers['Content-Length'] 147 | del headers['Cookie'] 148 | 149 | cookies = {'Cookie': 'ASP.NET_SessionId' + '=' + 150 | self.init_req.cookies.get_dict()['ASP.NET_SessionId']} 151 | self.sess.headers.update(cookies) 152 | 153 | data = self.init_states 154 | 155 | for k in data.keys(): 156 | params1[k] = data[k] 157 | params2[k] = data[k] 158 | 159 | for i, c in enumerate(config): 160 | print(i, num_configs, c) 161 | 162 | for p in (params1, params2): 163 | p['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0] 164 | p['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1] 165 | p['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1] 166 | p['ASPxRoundPanel1$ASPxComboBoxSTerm'] = c['sterm'][0] 167 | p['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = c['sterm'][1] 168 | p['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = c['sterm'][1] 169 | p['ASPxRoundPanel1$ASPxComboBoxETerm'] = c['eterm'][0] 170 | p['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = c['eterm'][1] 171 | p['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = c['eterm'][1] 172 | p['ASPxRoundPanel1$ASPxComboBoxBSSub'] = c['skill'][0] 173 | p['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = c['skill'][1] 174 | p['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = c['skill'][1] 175 | p['ASPxRoundPanel1$ASPxComboBoxPL'] = c['level'][0] 176 | p['ASPxRoundPanel1_ASPxComboBoxPL_VI'] = c['level'][1] 177 | p['ASPxRoundPanel1$ASPxComboBoxPL$DDD$L'] = c['level'][1] 178 | 179 | 
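# params2 mimics the grid's 'Save As' postback; listExportFormat '1' appears to select the CSV export, since the response body is written straight to a .csv file below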
params2['__EVENTTARGET'] = 'buttonSaveAs' 180 | params2['listExportFormat'] = '1' 181 | 182 | # need to start sesh 183 | r = self.sess.post(self.url, data=params1) 184 | 185 | # now get full report 186 | r = self.sess.post(self.url, data=params2) 187 | 188 | with open("data/" + self.college[0] + '-' + c['sterm'][1] + '-' + c['eterm'][1] + '-' + c['skill'][1] + '-' + c['level'][1] + '.csv', 'w') as f: 189 | f.write(r.text) 190 | 191 | pickle.dump(config[i + 1:], 192 | open('./pickles/' + self.college[0] + '.pkl', 'wb')) 193 | 194 | time.sleep(1) 195 | 196 | if __name__ == "__main__": 197 | colleges = pickle.load(open('./pickles/college_list.pkl', 'rb')) 198 | colleges = colleges[:5] 199 | 200 | for c in colleges: 201 | if not './pickles/' + c[0] + '.pkl' in glob.glob('./pickles/*.pkl'): 202 | BasicSkillsCollege((c[0], c[1])).get_levels() 203 | 204 | BasicSkillsCollege((c[0], c[1])).dl_csv() 205 | -------------------------------------------------------------------------------- /PACER-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import random 3 | import time 4 | from bs4 import BeautifulSoup 5 | import os 6 | import csv 7 | import datetime 8 | from send_email import send_email 9 | import glob 10 | import subprocess 11 | import re 12 | from pyvirtualdisplay import Display 13 | import sys 14 | 15 | 16 | def sift_chars(fname_str): 17 | ''' 18 | ensures filename is legal, replaces all with hyphens 19 | ''' 20 | 21 | illegal_chars = "%> 30: 98 | break 99 | 100 | time.sleep(1) # case has been found, proceed 101 | 102 | driver.find_element_by_name("date_from").clear() 103 | driver.find_element_by_name("date_from").send_keys("01/01/1990") 104 | driver.find_element_by_name("date_to").clear() 105 | driver.find_element_by_name("date_to").send_keys( 106 | datetime.date.today().strftime("%m/%d/%Y")) 107 | 108 | time.sleep(1) 109 | driver.find_element_by_name('button1').click() 110 | 111 | # get source to get docket info 112 | docket_source = str(driver.page_source) 113 | soup = BeautifulSoup(docket_source, 'html5lib') 114 | 115 | # set start for row, will change if scrape was interrupted 116 | row_start = 0 117 | 118 | # get associated cases if main case 119 | if case_num: 120 | 121 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f: 122 | reader = csv.reader(f) 123 | data = list(reader) 124 | 125 | if len(data) == 1: 126 | get_associated_cases(soup) 127 | # save docket source if main case 128 | with open(district + "/" + je_id + "/" + str(je_id) + ".html", "w", encoding="utf-8") as f: 129 | f.write(docket_source) 130 | 131 | else: 132 | row_start = len(data) - 1 133 | 134 | else: 135 | 136 | if os.path.exists( 137 | district + 138 | "/" + 139 | je_id + 140 | "/associated/" + 141 | str(case_num) + 142 | "/" + 143 | 'assoc_data.csv'): 144 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 145 | reader = csv.reader(f) 146 | data = list(reader) 147 | 148 | row_start = len(data) - 1 149 | 150 | docket_rows = [] 151 | for i in range(len(soup.findAll("table")) - 5): 152 | # table is broken up to sets of 100 rows, don't want first 4 or last 153 | ind = i + 4 154 | docket_table = soup.findAll("table")[ind] 155 | docket_headers = ("Filing Date", "#", "Docket Text") 156 | 157 | # get table info in dict 158 | for row in docket_table.findAll("tr"): 159 | row_data = [] 160 | for i, column in enumerate(row.findAll("td")): 
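# keep cell 0 (filing date) as plain text, skip cell 1, and store cell 2 as a (text, {link text: href}) pair so document links survive; any later cells keep their text only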
161 | if i == 0: 162 | row_data.append(column.text) 163 | elif i == 2: 164 | cell_urls = {} 165 | urls = column.findAll("a") 166 | for u in urls: 167 | cell_urls[u.text.strip()] = u.get("href") 168 | 169 | row_data.append((column.text.strip(), cell_urls)) 170 | 171 | elif i > 2: 172 | row_data.append(column.text.strip()) 173 | 174 | if len(row_data) > 0: 175 | docket_rows.append(tuple(row_data)) 176 | 177 | return docket_rows[row_start:] 178 | 179 | 180 | def process_link( 181 | link_str, 182 | base_url, 183 | district, 184 | already_scraped, 185 | adversary=False, 186 | dock_num=False): 187 | ''' 188 | takes any links to documents, and downloads them into file structure 189 | ''' 190 | 191 | if link_str.startswith("https://"): 192 | pass 193 | else: 194 | link_str = base_url + link_str 195 | 196 | driver.get(link_str) 197 | f_paths = [] 198 | 199 | if "Multiple Documents" in str(driver.page_source): 200 | soup = BeautifulSoup(str(driver.page_source), 'html5lib') 201 | doc_table = soup.findAll("tr") 202 | for r in doc_table: 203 | if "href" in str(r): 204 | tds = r.findAll("td") 205 | doc_url = tds[0].a["href"] 206 | dl_id = doc_url.split("/")[-1] 207 | if dl_id not in already_scraped: 208 | os.system('rm *.pdf') 209 | if doc_url.startswith("https://"): 210 | driver.get(doc_url) 211 | driver.find_element_by_xpath( 212 | '//*[@id="cmecfMainContent"]/form/input').click() 213 | else: 214 | doc_url = base_url + doc_url 215 | driver.get(doc_url) 216 | driver.find_element_by_xpath( 217 | '//*[@id="cmecfMainContent"]/form/input').click() 218 | 219 | file_name = tds[2].text 220 | new_name = sift_chars(file_name.strip()) + ".pdf" 221 | 222 | # if not associated case 223 | # create file structure 224 | if not adversary: 225 | if not os.path.exists( 226 | district + "/" + je_id + "/" + docket_number): 227 | os.makedirs( 228 | district + "/" + je_id + "/" + docket_number) 229 | 230 | new_path = district + "/" + je_id + "/" + docket_number + "/" + new_name 231 | 232 | else: 233 | if not os.path.exists( 234 | district + 235 | "/" + 236 | je_id + 237 | "/associated/" + 238 | adversary + 239 | "/" + 240 | dock_num): 241 | os.makedirs( 242 | district + 243 | "/" + 244 | je_id + 245 | "/associated/" + 246 | adversary + 247 | "/" + 248 | dock_num) 249 | 250 | new_path = district + "/" + je_id + "/associated/" + \ 251 | adversary + "/" + dock_num + "/" + new_name 252 | 253 | # wait for file to download 254 | counter = 0 255 | while len(glob.glob("*.pdf")) == 0: 256 | time.sleep(1) 257 | counter += 1 258 | if counter > 500: 259 | break 260 | 261 | time.sleep(4) 262 | download_name = glob.glob("*.pdf")[0] 263 | os.rename( 264 | download_name, re.sub( 265 | r'[^\x00-\x7f]', '-', new_path)) 266 | 267 | already_scraped.append(dl_id) 268 | f_paths.append(new_path) 269 | 270 | time.sleep(1) 271 | os.system('rm *.pdf') 272 | time.sleep(1) 273 | 274 | else: 275 | soup = BeautifulSoup(str(driver.page_source), 'html5lib') 276 | 277 | restricted = False 278 | 279 | try: 280 | dl_id = soup.find("form")["action"].split("/")[-1] 281 | 282 | except: 283 | if "The document is restricted" in driver.page_source: 284 | restricted = True 285 | elif "document is not available" in driver.page_source: 286 | restricted = True 287 | 288 | if not restricted: 289 | os.system('rm *.pdf') 290 | driver.find_element_by_xpath( 291 | '//*[@id="cmecfMainContent"]/form/input').click() 292 | 293 | if dl_id not in already_scraped: 294 | 295 | # create file structure 296 | if not adversary: 297 | if not os.path.exists( 298 | district + "/" + je_id 
+ "/" + docket_number): 299 | os.makedirs( 300 | district + "/" + je_id + "/" + docket_number) 301 | 302 | new_path = district + "/" + je_id + "/" + docket_number + "/Main Document.pdf" 303 | 304 | else: 305 | if not os.path.exists( 306 | district + 307 | "/" + 308 | je_id + 309 | "/associated/" + 310 | adversary + 311 | "/" + 312 | dock_num): 313 | os.makedirs( 314 | district + 315 | "/" + 316 | je_id + 317 | "/associated/" + 318 | adversary + 319 | "/" + 320 | dock_num) 321 | 322 | new_path = district + "/" + je_id + "/associated/" + \ 323 | adversary + "/" + dock_num + "/Main Document.pdf" 324 | 325 | # wait for file to download 326 | counter = 0 327 | while len(glob.glob("*.pdf")) == 0: 328 | time.sleep(1) 329 | counter += 1 330 | if counter > 500: 331 | break 332 | 333 | time.sleep(4) 334 | download_name = glob.glob("*.pdf")[0] 335 | os.rename( 336 | download_name, re.sub( 337 | r'[^\x00-\x7f]', '-', new_path)) 338 | 339 | already_scraped.append(dl_id) 340 | f_paths.append(new_path) 341 | 342 | time.sleep(1) 343 | os.system('rm *.pdf') 344 | time.sleep(1) 345 | 346 | else: 347 | f_paths.append("RESTRICTED") 348 | time.sleep(5) 349 | 350 | return (f_paths, already_scraped) 351 | 352 | 353 | def get_associated_cases(soup): 354 | 355 | ass_exist = True 356 | 357 | try: 358 | ass_cases_ext = soup.findAll("div", {"class": "noprint"})[ 359 | 1].find("a")["href"] 360 | 361 | except: 362 | ass_exist = False 363 | 364 | if ass_exist: 365 | driver.get(base_url + ass_cases_ext) 366 | driver.find_element_by_xpath('//*[@id="referrer_form"]/p/a').click() 367 | soup = BeautifulSoup(str(driver.page_source), "html5lib") 368 | 369 | assoc_rows = soup.find("table").findAll("tr") 370 | 371 | if not os.path.exists(district + "/" + je_id + "/" + "associated"): 372 | os.makedirs(district + "/" + je_id + "/" + "associated") 373 | 374 | with open(district + "/" + je_id + "/" + str(je_id) + "_associated_cases.html", "w", encoding="utf-8") as f: 375 | f.write(str(driver.page_source)) 376 | 377 | # if interrupted start from where last row 378 | if os.path.exists( 379 | str(district) + 380 | "/" + 381 | str(je_id) + 382 | "/" + 383 | str(je_id) + 384 | '_associated_cases.csv'): 385 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'r', encoding="utf-8") as f: 386 | reader = csv.reader(f) 387 | data = list(reader) 388 | 389 | if len(data) - 1 == len(assoc_rows): 390 | assoc_rows = [assoc_rows[-1]] 391 | else: 392 | assoc_rows = assoc_rows[len(data) - 2:] 393 | 394 | else: 395 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f: 396 | w = csv.writer(f, delimiter=',') 397 | header = ( 398 | "je_id", 399 | "Related Case No", 400 | "Caption", 401 | "Type", 402 | "Judge", 403 | "Plaintiff", 404 | "Defendant", 405 | "Plaintiff Lawyer", 406 | "Defendant Lawyer", 407 | "Date Filed", 408 | "Date Terminated", 409 | "Nature of Suit") 410 | w.writerow(header) 411 | 412 | for row in assoc_rows: # CHANGE FOR FULL 413 | columns = row.findAll("td") 414 | if len(columns) > 0: 415 | 416 | case_ext = columns[1].find("a")["href"] 417 | case_num = columns[1].find("a").text 418 | caption = ' '.join(columns[1].text.split()[1:]) 419 | case_type = columns[2].text 420 | 421 | row_to_write = (je_id, case_num, caption, case_type) 422 | 423 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f: 424 | w = csv.writer(f, delimiter=',') 425 | w.writerow(row_to_write) 426 | 427 
| driver.get(base_url + case_ext) 428 | 429 | docket_rows = get_docket_rows( 430 | driver=driver, 431 | case_num=False, 432 | year=False, 433 | court_perl=False) 434 | 435 | if not os.path.exists( 436 | district + "/" + je_id + "/associated/" + case_num): 437 | os.makedirs( 438 | district + "/" + je_id + "/associated/" + case_num) 439 | 440 | with open(district + "/" + je_id + "/associated/" + case_num + "/" + str(case_num) + ".html", "w", encoding="utf-8") as f: 441 | f.write(str(driver.page_source)) 442 | 443 | if os.path.exists( 444 | district + 445 | "/" + 446 | je_id + 447 | "/associated/" + 448 | str(case_num) + 449 | "/" + 450 | 'assoc_data.csv'): 451 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 452 | reader = csv.reader(f) 453 | data = list(reader) 454 | 455 | docket_rows = docket_rows[len(data) - 1:] 456 | 457 | else: 458 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f: 459 | w = csv.writer(f, delimiter=',') 460 | header = ( 461 | "je_id", 462 | "case_num", 463 | "docket_text", 464 | "docket_number", 465 | "docket_date", 466 | "file_link", 467 | "[lawfirm1]", 468 | "[lawyers1]", 469 | "[lawfirm2]", 470 | "[lawyers2]", 471 | "[lawfirm3]", 472 | "[lawyers3]", 473 | "[moving party]", 474 | "[motion caption]") 475 | w.writerow(header) 476 | 477 | for row in docket_rows: # just 20 rows CHANGE FOR FULL 478 | docket_date = row[0] 479 | docket_text = row[2].strip() 480 | if len( 481 | row[1]) > 1 and len( 482 | row[1][0]) > 0 and row[1][0][0].isdigit(): 483 | docket_number = row[1][0].split()[0] 484 | 485 | else: 486 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 487 | reader = csv.reader(f) 488 | temp_data = list(reader) 489 | docket_number = temp_data[-1][-3] 490 | 491 | already_scraped = [] 492 | paths = [] 493 | for c in row: 494 | if len(c) > 1 and isinstance( 495 | c[1], dict) and len( 496 | c[1]) > 0: 497 | for k in c[1].keys(): 498 | url = c[1][k] 499 | res = process_link( 500 | link_str=url, 501 | base_url=base_url, 502 | district=district, 503 | already_scraped=already_scraped, 504 | dock_num=docket_number, 505 | adversary=case_num) 506 | file_paths = res[0] 507 | if len(file_paths) > 0: 508 | already_scraped = res[1] 509 | paths.extend(file_paths) 510 | 511 | # wait after each link call 512 | time.sleep(random.randint(1, 3)) 513 | 514 | csv_row = [ 515 | je_id, 516 | case_num, 517 | docket_text, 518 | docket_number, 519 | docket_date, 520 | "; ".join(paths)] 521 | scraped_data[district].append(csv_row) 522 | 523 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f: 524 | w = csv.writer(f, delimiter=',') 525 | w.writerow(csv_row) 526 | 527 | time.sleep(random.randint(1, 3)) 528 | 529 | 530 | # In[ ]: 531 | 532 | # main program 533 | # for case num info 534 | with open('dataset.csv', 'r', encoding="utf-8") as f: 535 | reader = csv.reader(f) 536 | data = list(reader) 537 | 538 | with open('distlogin.csv', 'r', encoding="utf-8") as f: 539 | reader = csv.reader(f) 540 | distlogin_csv = list(reader) 541 | 542 | with open('completed', 'r') as f: 543 | completed_cases = f.read().split('\n') 544 | 545 | email_address = distlogin_csv[0][0] 546 | email_password = distlogin_csv[0][1] 547 | dl_directory = distlogin_csv[0][2] 548 | district = distlogin_csv[1][0] 549 | 550 | # change for each district 
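# keep only the dataset.csv rows whose second-to-last column matches this run's district, then strip spaces from the district name so it can be used in file paths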
551 | dist_data = [x for x in data if x[-2] == district] 552 | district = ''.join(district.split()) 553 | 554 | distlogin = {} 555 | 556 | for r in distlogin_csv[1:]: 557 | distlogin[district] = {"login": r[1], 558 | "pw": r[2], 559 | "base_url": r[3]} 560 | 561 | # prepare and loop 562 | scraped_data = {} 563 | scraped_data[district] = [] 564 | 565 | if not os.path.exists(district): 566 | os.makedirs(district) 567 | 568 | driver = login_to_pacer( 569 | login_user=distlogin[district]["login"], 570 | login_password=distlogin[district]["pw"], 571 | dl_directory=dl_directory) 572 | 573 | for case in dist_data: # just two cases CHANGE FOR FULL 574 | 575 | print(datetime.datetime.time(datetime.datetime.now())) 576 | 577 | company = case[0] 578 | je_id = case[1] 579 | case_num = case[2] 580 | petition_date = case[3] 581 | year = case[6] 582 | 583 | if je_id not in completed_cases: 584 | 585 | send_email( 586 | email_address, 587 | email_password, 588 | email_address, 589 | "New Case", 590 | "JEID" + 591 | str(je_id)) 592 | 593 | if not os.path.exists(district + "/" + je_id): 594 | os.makedirs(district + "/" + je_id) 595 | 596 | if not os.path.exists( 597 | district + 598 | "/" + 599 | je_id + 600 | "/" + 601 | je_id + 602 | "_data.csv"): 603 | # for output data 604 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'w', encoding="utf-8") as f: 605 | w = csv.writer(f, delimiter=',') 606 | header = ( 607 | "Company", 608 | "je_id", 609 | "petition_date", 610 | "casenum", 611 | "xdistfiled", 612 | "docket_text", 613 | "docket_number", 614 | "docket_date", 615 | "file_link", 616 | "[lawfirm1]", 617 | "[lawyers1]", 618 | "[lawfirm2]", 619 | "[lawyers2]", 620 | "[lawfirm3]", 621 | "[lawyers3]", 622 | "[moving party]", 623 | "[motion caption]") 624 | w.writerow(header) 625 | 626 | # change for each district 627 | base_url = distlogin[district]["base_url"] 628 | court_perl = base_url + "/cgi-bin/DktRpt.pl" 629 | docket_rows = get_docket_rows( 630 | driver=driver, 631 | case_num=case_num, 632 | year=year, 633 | court_perl=court_perl) 634 | 635 | for row in docket_rows: # just 20 rows CHANGE FOR FULL 636 | docket_date = row[0] 637 | docket_text = row[2].strip() 638 | if len( 639 | row[1]) > 1 and len( 640 | row[1][0]) > 0 and row[1][0][0].isdigit(): 641 | docket_number = row[1][0].split()[0] 642 | else: 643 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f: 644 | reader = csv.reader(f) 645 | temp_data = list(reader) 646 | docket_number = temp_data[-1][-3] 647 | 648 | already_scraped = [] 649 | paths = [] 650 | for c in row: 651 | if len(c) > 1 and isinstance(c[1], dict) and len(c[1]) > 0: 652 | for k in c[1].keys(): 653 | url = c[1][k] 654 | res = process_link( 655 | link_str=url, 656 | base_url=base_url, 657 | district=district, 658 | already_scraped=already_scraped) 659 | file_paths = res[0] 660 | if len(file_paths) > 0: 661 | already_scraped = res[1] 662 | paths.extend(file_paths) 663 | 664 | # wait after each link call 665 | time.sleep(random.randint(1, 3)) 666 | 667 | csv_row = [ 668 | company, 669 | je_id, 670 | petition_date, 671 | case_num, 672 | district, 673 | docket_text, 674 | docket_number, 675 | docket_date, 676 | "; ".join(paths)] 677 | scraped_data[district].append(csv_row) 678 | 679 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'a', encoding="utf-8") as f: 680 | w = csv.writer(f, delimiter=',') 681 | w.writerow(csv_row) 682 | 683 | with open('completed', 'a') as f: 684 | f.write('\n' + je_id) 685 | 686 | # zip and 
push to box 687 | p = subprocess.Popen(['bash', 688 | 'zip-push.sh', 689 | je_id], 690 | stdin=None, 691 | stdout=None, 692 | stderr=None, 693 | close_fds=True) 694 | 695 | send_email( 696 | email_address, 697 | email_password, 698 | email_address, 699 | "Finished", 700 | "Done scraping." + 701 | str(je_id)) 702 | print(datetime.datetime.time(datetime.datetime.now())) 703 | --------------------------------------------------------------------------------