├── NCDC-NOAA-wget.sh
├── GLERLNOAA-wget.sh
├── EnergyAutomatedRegister-wget.sh
├── GoogleDirectionsBuildURL-API.py
├── Kalamazoo-BS.py
├── GoogleSearch-dry.py
├── CrunchBase-API.py
├── Grimm-BS.py
├── FRMinutesDiscoutRate-BS.py
├── GLORecords-dry.py
├── GoogleGeocodeSearches.py
├── OpenSecrets-API.py
├── Perrault-BS.py
├── SFPlanning-BS.py
├── WebofScience-API.py
├── LICENSE
├── DataSFFireIncidents-API.r
├── GlassDoor-API.py
├── NationalStolenArtFile-BS.py
├── WikipediaRevisionHistory-API.py
├── INOCAR-selenium.py
├── LucidChart-BS.py
├── MRIPNOAA-selenium.py
├── .gitignore
├── Kiva-API.py
├── BGG-BS.py
├── BoardGameCapital-selenium.py
├── IMSDB-BS.py
├── NHSTrustsInfo-BS.py
├── INOCAR-AJAX.py
├── GoogleGeoLatLong-API.py
├── LARRP-BS.py
├── Wiktionary-API.py
├── ADA-ERP-BS.py
├── RioGrandeGames-selenium.py
├── STNMFSNOAA-BS.py
├── BAAD-BS.py
├── README.md
├── BSBDigitaleSammlungen-API.py
├── AHA-selenium.py
├── ResidentAdvisor-selenium.py
├── CTSNet-selenium.py
├── Doximity-selenium.py
├── RateMyProfessors-selenium.py
├── DataMartBasicSkills-req.py
└── PACER-selenium.py
/NCDC-NOAA-wget.sh:
--------------------------------------------------------------------------------
1 | # recursively mirror NOAA's total solar irradiance (TSI) climate data record
2 | wget -r ftp://data.ncdc.noaa.gov/cdr/solar-irradiance/tsi/
--------------------------------------------------------------------------------
/GLERLNOAA-wget.sh:
--------------------------------------------------------------------------------
1 | # recursively mirror the GLERL met-station status archive (stay below this directory)
2 | wget -r --no-parent https://www.glerl.noaa.gov//metdata/status/status_archive/
--------------------------------------------------------------------------------
/EnergyAutomatedRegister-wget.sh:
--------------------------------------------------------------------------------
1 | wget https://www.energy.gov/eere/downloads/automated-register-implemented-actions
2 | mv automated-register-implemented-actions automated-register-implemented-actions.html
3 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2.xlsx
4 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2%20User%20Manual.pdf
--------------------------------------------------------------------------------
/GoogleDirectionsBuildURL-API.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | base = "https://www.google.com/maps/dir/"
4 |
5 | locs = [
6 | "305 Harrison St, seattle, WA 98109, USA",
7 | "san francisco airport",
8 | "UC berkeley",
9 | "stanford university"]
10 |
11 | # placeholder: the Directions API could be queried here to optimize the
12 | # stop order (into ordered_locs); for now the stops are used as listed
13 | ordered_locs = []
14 | 
15 | final_url = base
16 |
17 | for l in locs:
18 | final_url += '+'.join(l.split()) + "/"
19 |
20 | print(final_url)
21 |
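22 | # a stricter alternative (not used above) would be to URL-encode every
23 | # special character, not just spaces, via urllib.parse.quote_plus, e.g.:
24 | # from urllib.parse import quote_plus
25 | # print(base + "/".join(quote_plus(l) for l in locs) + "/")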
--------------------------------------------------------------------------------
/Kalamazoo-BS.py:
--------------------------------------------------------------------------------
1 | from urllib.request import Request, urlopen
2 | from bs4 import BeautifulSoup
3 |
4 | urls = []
5 | for i in range(1035, 1053):
6 | urls.append(
7 | "http://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=" +
8 | str(i) +
9 | "&context=medieval_cong_archive")
10 |
11 | for i, url in enumerate(urls):
12 | res = urlopen(Request(url))
13 | pdf = open(("kzoo/kalamazoo_" + str(i) + ".pdf"), 'wb')
14 | pdf.write(res.read())
15 | pdf.close()
16 |
--------------------------------------------------------------------------------
/GoogleSearch-dry.py:
--------------------------------------------------------------------------------
1 | import dryscrape
2 | import sys
3 |
4 |
5 | search_term = 'testing'
6 |
7 | # set up a web scraping session
8 | sess = dryscrape.Session(base_url='http://google.com')
9 |
10 | # we don't need images
11 | sess.set_attribute('auto_load_images', False)
12 |
13 | # visit homepage and search for a term
14 | sess.visit('/')
15 | q = sess.at_xpath('//*[@name="q"]')
16 | q.set(search_term)
17 | q.form().submit()
18 |
19 | # extract all links
20 | for link in sess.xpath('//a[@href]'):
21 | print(link)
22 | print(link['href'])
23 |
24 | # # save a screenshot of the web page
25 | # sess.render('google.png')
26 | # print("Screenshot written to 'google.png'")
27 |
--------------------------------------------------------------------------------
/CrunchBase-API.py:
--------------------------------------------------------------------------------
1 | from __future__ import division  # __future__ imports must come first (no-op on Python 3)
2 | import requests
3 | import json
4 | import math
5 | import csv
6 |
7 | # set key
8 | key = "PUT_KEY_HERE"
9 |
10 | # set base url
11 | base_url = "https://api.crunchbase.com/v/3/organizations"
12 |
13 | # set response format
14 | response_format = ".json"
15 |
16 | # set search parameters
17 | search_params = {"name": "uber",
18 | "user_key": key,
19 | "page": "1"}
20 |
21 | # make request
22 | r = requests.get(base_url + response_format, params=search_params)
23 | response_text = r.text
24 |
25 | # Convert JSON response to a dictionary
26 | data = json.loads(response_text)
27 |
28 | print(data.keys())
29 |
--------------------------------------------------------------------------------
/Grimm-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import pandas as pd
5 |
6 | soup = BeautifulSoup(requests.get(
7 | "https://www.cs.cmu.edu/~spok/grimmtmp/").text, 'html5lib')
8 |
9 | titles = [x.text.strip() for x in soup.find_all("li")]
10 |
11 | base = 'https://www.cs.cmu.edu/~spok/grimmtmp/'
12 | rows = []
13 |
14 | for i in range(1, 210):
15 |
16 |     # stories are numbered 001.txt through 209.txt
17 |     url = base + '{}.txt'.format(str(i).zfill(3))
18 |
19 | text = requests.get(url).text.strip()
20 |
21 | rows.append([titles[i - 1], text])
22 |
23 | time.sleep(1)
24 |
25 | df = pd.DataFrame(rows, columns=['Title', 'Text'])
26 | df.to_csv("grimm.csv", index=False)
27 |
--------------------------------------------------------------------------------
/FRMinutesDiscoutRate-BS.py:
--------------------------------------------------------------------------------
1 | from urllib.request import Request, urlopen
2 | from bs4 import BeautifulSoup
3 |
4 |
5 | html = urlopen("http://www.federalreserve.gov/monetarypolicy/discountrate.htm")
6 | bsObj = BeautifulSoup(html.read(), "lxml")
7 | d1 = bsObj.findAll("option")
8 |
9 | urls = []
10 | for item in d1:
11 | if "PDF" in str(item.get_text()):
12 | prefix = "http://www.federalreserve.gov"
13 | url = prefix + str(item['value'])
14 | urls.append((url, str(item.get_text())))
15 |
16 | urls = urls[:3]
17 |
18 | print(len(urls))
19 |
20 | for url in urls:
21 | res = urlopen(Request(url[0]))
22 | pdf = open((url[1] + ".pdf"), 'wb')
23 | pdf.write(res.read())
24 | pdf.close()
25 |
--------------------------------------------------------------------------------
/GLORecords-dry.py:
--------------------------------------------------------------------------------
1 | import dryscrape
2 | import sys
3 | from urllib.request import Request, urlopen
4 | from bs4 import BeautifulSoup
5 | import time
6 |
7 |
8 | urls = ["http://www.glorecords.blm.gov"]
9 | ext = "/ConvertedImages/CV_Patent_0123-207.PDF"
10 |
11 | for url in urls:
12 | # set up a web scraping session
13 | sess = dryscrape.Session(base_url=url)
14 |
15 |     # keep image loading enabled so the converted patent image renders
16 |     sess.set_attribute('auto_load_images', True)
17 | 
18 |     # visit the converted patent PDF page and give it time to load
19 |     sess.visit(ext)
20 | time.sleep(15)
21 | # sess.render('sshot.png')
22 |
23 | res = urlopen(Request(url + ext))
24 |     pdf = open(ext.split('/')[-1], 'wb')  # save under the original PDF filename
25 | pdf.write(res.read())
26 | pdf.close()
27 |
--------------------------------------------------------------------------------
/GoogleGeocodeSearches.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import urllib
3 | import time
4 |
5 | searches = ['UC Berkeley', 'University of Minnesota', 'Middlebury College']
6 |
7 | latitude = []
8 | longitude = []
9 | for s in searches:
10 | search = urllib.parse.quote(s)
11 |
12 | print(s)
13 |
14 | try:
15 | json_res = requests.get(
16 | 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(search)).json()
17 | coordinates = json_res['results'][0]['geometry']['location']
18 | latitude.append(coordinates['lat'])
19 | longitude.append(coordinates['lng'])
20 | except:
21 | latitude.append(None)
22 | longitude.append(None)
23 |
24 | time.sleep(.5)
25 |
26 | print(list(zip(latitude, longitude)))
27 |
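28 | # note: Google's Geocoding API now requires an API key; if the response
29 | # status is REQUEST_DENIED, append '&key=YOUR_API_KEY' to the URL above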
--------------------------------------------------------------------------------
/OpenSecrets-API.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.request import Request, urlopen
3 |
4 |
5 | def getJson(func, apikey, params):
6 | url = 'http://www.opensecrets.org/api/?method=%s&output=json&%s&apikey=%s' % \
7 | (func, params, apikey)
8 |
9 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
10 |
11 | response = urlopen(req).read().decode('utf-8')
12 | responseJson = json.loads(response)
13 |
14 | return responseJson
15 |
16 | func = "getOrgs"
17 | apikey = ""
18 | params = "org=Exxon"
19 |
20 | info = getJson(func, apikey, params)
21 |
22 | print(info)
23 |
24 | orgid = info.get("response").get("organization")[
25 | 0].get("@attributes").get("orgid")
26 |
27 | func = "orgSummary"
28 | params = "id=" + orgid
29 | summary = getJson(func, apikey, params)
30 |
31 | print(summary)
32 |
--------------------------------------------------------------------------------
/Perrault-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import pandas as pd
5 |
6 | rows = []
7 | for i in range(1, 12):
8 |
9 | url = 'http://www.pitt.edu/~dash/perrault{}.html'.format(
10 | str(i).zfill(2))
11 |
12 | soup = BeautifulSoup(requests.get(url).text, 'html5lib')
13 |
14 | title = soup.find('h1').text.strip()
15 | text = '\n'.join([p.text for p in soup.find_all('p')[:-1]])
16 | try:
17 | text += soup.find('blockquote').text
18 | except:
19 | pass
20 |
21 |     at = ''  # Aarne-Thompson type number; may be absent from a page
22 |     for b in soup.find_all('li'):
23 |         if "aarne" in b.text.lower():
24 |             at = ''.join([ch for ch in b.text if ch.isnumeric()])
25 | 
26 |     rows.append([title, at, text])
27 |
28 | time.sleep(1)
29 |
30 | df = pd.DataFrame(rows, columns=['Title', 'Aarne-Thompson', 'Text'])
31 | df.to_csv("perrault.csv", index=False)
32 |
--------------------------------------------------------------------------------
/SFPlanning-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 |
5 | all_links = []
6 | res = requests.get('http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html')
7 | soup = BeautifulSoup(res.text, 'lxml')
8 |
9 | # build date links
10 | base = 'http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/'
11 | links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')]
12 |
13 | # collect nested links
14 | for l in links:
15 | res = requests.get(l)
16 | soup = BeautifulSoup(res.text, 'lxml')
17 |
18 |     sub_links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')]
19 |     all_links.extend(sub_links)
20 | time.sleep(1)
21 |
22 | # save HTML response for all links
23 | for l in all_links:
24 | html = requests.get(l).text
25 | name = l.split('=')[-1]
26 | print(name)
27 | with open('sfplanning/' + name, 'w') as f:
28 | f.write(html)
29 | time.sleep(1)
30 |
--------------------------------------------------------------------------------
/WebofScience-API.py:
--------------------------------------------------------------------------------
1 | from wos import WosClient
2 | import wos.utils
3 | import time
4 |
5 | # must be on campus with access
6 | with WosClient('') as client:
7 | journals = ["Science"]
8 | years = range(2000, 2001)
9 | for journal in journals:
10 | for year in years:
11 |
12 | rf = wos.utils.recordsFound(
13 | client, 'PY=' + str(year) + ' AND SO=' + journal)
14 |
15 | for num in range(1, rf, 100):
16 |
17 | info = wos.utils.query(
18 | client,
19 | 'PY=' +
20 | str(year) +
21 | ' AND SO=' +
22 | journal,
23 | count=100,
24 | frecord=num)
25 |
26 |             with open("data/" + str(year) + '-' + journal + ' ' + str(num) + ".xml", "w", encoding="utf-8") as f:
27 |                 f.write(info)
28 |
29 | time.sleep(2)
30 |
31 | # http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Christopher Hench
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/DataSFFireIncidents-API.r:
--------------------------------------------------------------------------------
1 | ##################################################
2 | ## Project: Collect API data on fire incidents
3 | ## Author: Christopher Hench
4 | ##################################################
5 |
6 | flatten_json <- function(df) {
7 |
8 | for (col in names(df)) {
9 | if (is.list(df[[col]])) {
10 | i <- 1
11 | for (row in df[[col]]) {
12 |
13 | df[[col]][i] <- paste(row, collapse = '; ')
14 | i <- i + 1
15 | }
16 | df[[col]] <- unlist(df[[col]])
17 | }
18 | }
19 | return (df)
20 | }
21 |
22 | base_url <- 'https://data.sfgov.org/resource/wbb6-uh78.json?'
23 |
24 | incident_date <- '2017-10-22T00:00:00.000'
25 | incident_date <- URLencode(URL = incident_date, reserved = TRUE)
26 |
27 | get_request <- paste0(base_url, "incident_date=", incident_date)
28 | print(get_request)
29 |
30 | response <- httr::GET(url = get_request)
31 | response <- httr::content(x = response, as = "text")
32 | response_df <- data.frame(jsonlite::fromJSON(txt = response, simplifyDataFrame = TRUE, flatten = TRUE))
33 |
34 | flattened <- flatten_json(response_df)
35 |
36 | write.csv(flattened, file='fire-incidents.csv')
--------------------------------------------------------------------------------
/GlassDoor-API.py:
--------------------------------------------------------------------------------
1 | # https://pypi.python.org/pypi/glassdoor
2 | # http://stackoverflow.com/questions/30956891/rest-glassdoor-api-requires-user-agent-in-header
3 | import urllib.request as request
4 | import requests
5 | import json
6 | from collections import OrderedDict
7 |
8 | # authentication information & other request parameters
9 | params_gd = OrderedDict({
10 | "v": "1",
11 | "format": "json",
12 | "t.p": "",
13 | "t.k": "",
14 | "action": "employers",
15 | "employerID": "11111",
16 | # programmatically get the IP of the machine
17 | "userip": json.loads(request.urlopen("http://ip.jsontest.com/").read().decode('utf-8'))['ip'],
18 | "useragent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36"
19 | })
20 |
21 | # construct the URL from parameters
22 | basepath_gd = 'http://api.glassdoor.com/api/api.htm'
23 |
24 | # request the API
25 | response_gd = requests.get(
26 | basepath_gd, params=params_gd, headers={
27 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36"})
28 |
29 | # check the response code (should be 200) & then parse the content
30 | print(response_gd.status_code)
31 | data = json.loads(response_gd.text)
32 |
33 | print(data["response"]["employers"][0].keys())
34 |
--------------------------------------------------------------------------------
/NationalStolenArtFile-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import pickle
5 | import pandas as pd
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
9 | artworks = []
10 | for i in range(0, 7200, 100):
11 | print(i)
12 | url = 'https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file?b_start:int=' + \
13 | str(i)
14 |     res = requests.get(url, headers=headers)
15 | soup = BeautifulSoup(res.text, 'html5lib')
16 |
17 |     for item in soup.find_all('li', {'class': 'grid-item'}):
18 | 
19 |         art = {}
20 |         art['title'] = item.find('h3').text
21 |         art['description'] = item.find('p').text
22 | 
23 |         try:
24 |             art['image_link'] = item.find('img')['src']
25 |         except (TypeError, KeyError):
26 |             art['image_link'] = 'None'
27 | 
28 |         keys = [x.text for x in item.find_all('b')]
29 |         values = [x.text for x in item.find_all('span')]
30 |
31 | for t in list(zip(keys, values)):
32 | art[t[0]] = t[1]
33 |
34 | artworks.append(art)
35 |
36 | pickle.dump(artworks, open('artworks.pkl', 'wb'))
37 | time.sleep(5)
38 |
39 | pd.DataFrame(artworks).to_csv('artworks.csv', index=False)
40 |
--------------------------------------------------------------------------------
/WikipediaRevisionHistory-API.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | import time
4 | import re
5 | from bs4 import BeautifulSoup
6 | import json
7 | import pickle
8 |
9 |
10 | def get_revisions(page_title, num_rev):
11 | url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&rvprop=ids|flags|timestamp|comment|user|content|tags|flags&rvlimit=1&rvdiffto=prev&titles=" + page_title
12 | revisions = []
13 | next_request = '' # information for the next request
14 |
15 | # while True:
16 | for i in range(num_rev):
17 | response = json.loads(
18 | requests.get(
19 | url +
20 | next_request).text) # web request
21 |
22 | page_id = list(response['query']['pages'].keys())[0]
23 | revisions.append(
24 | response['query']['pages'][
25 | str(page_id)]['revisions'][0])
26 |
27 |         cont = response.get('continue', {}).get('rvcontinue')
28 |         if not cont:  # stop when the API returns no 'continue' element
29 | break
30 |
31 | # gets the revision Id from which to start the next request
32 | next_request = "&rvcontinue=" + cont
33 |
34 | time.sleep(1)
35 |
36 | return revisions
37 |
38 |
39 | page_names = pickle.load(open('page_names.pkl', 'rb'))
40 |
41 | for p in page_names:
42 | print(p)
43 | results = get_revisions(p, 200)
44 | pickle.dump(results, open('pickles/' + p + '.pkl', 'wb'))
45 |
--------------------------------------------------------------------------------
/INOCAR-selenium.py:
--------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.common.exceptions import TimeoutException
7 | from selenium.webdriver.support.ui import Select
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | def init_driver():
12 | driver = webdriver.Chrome()
13 | driver.wait = WebDriverWait(driver, 5)
14 | return driver
15 |
16 |
17 | def lookup(driver, query):
18 | driver.get("http://www.inocar.mil.ec/mareas/pagina_mareas.php")
19 | # a = driver.wait.until(EC.presence_of_element_located((By.NAME,
20 | # "id_puerto")))
21 | driver.find_element_by_xpath(
22 | "//select[@name='id_puerto']/option[@value='378']").click()
23 | driver.find_element_by_xpath(
24 | "//select[@name='dias']/option[@value='1']").click()
25 | driver.find_element_by_xpath(
26 | "//select[@name='mes']/option[@value='1']").click()
27 | driver.find_element_by_xpath(
28 | "//select[@name='anio']/option[@value='2015']").click()
29 | driver.find_element_by_name("Submit").click()
30 |
31 | html = driver.page_source
32 | soup = BeautifulSoup(html, 'lxml')
33 | a = soup.findAll("div")
34 | print(a)
35 |
36 | if __name__ == "__main__":
37 | driver = init_driver()
38 | lookup(driver, "Selenium")
39 | time.sleep(5)
40 | driver.quit()
41 |
--------------------------------------------------------------------------------
/LucidChart-BS.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import json
3 | import csv
4 | import sys
5 |
6 | # read in html source of chart
7 | html_path = sys.argv[1]
8 | with open(html_path, "r") as f:
9 | html = f.read()
10 |
11 | soup = BeautifulSoup(html, "lxml")
12 |
13 | # find line of JSON data
14 | raw_data = str(soup)[
15 | str(soup).find("var doc = ") +
16 | len("var doc = "):str(soup).find(";\n doc.Document.state = doc.Document.state")]
17 |
18 | figure_data = json.loads(raw_data)
19 |
20 | # get states JSON
21 | states = json.loads(figure_data["Document"]['state'])
22 |
23 |
24 | def find_corr_text(thread_id, soup):
25 | '''
26 | find the text from a ThreadId
27 | '''
28 | item_id = states['Threads'][thread_id]["ItemId"]
29 | loc = str(soup).find(item_id)
30 | end = str(soup)[loc:].find("}}")
31 | raw = str(soup)[loc + len(item_id) + 3:][:end - \
32 | len(item_id) - 1].replace("\\", "")
33 |
34 | try:
35 | props = json.loads(raw)
36 | text = props["Properties"]["Text"]['t']
37 | except:
38 | return None
39 |
40 | return text
41 |
42 | # cycle through comments and add text
43 | rows = []
44 | for k in states['Comments'].keys():
45 | states['Comments'][k]['text'] = find_corr_text(
46 | states['Comments'][k]['ThreadId'], soup)
47 | rows.append(states['Comments'][k])
48 |
49 | # write csv
50 | with open('lucidchart-comments.csv', 'w') as f:
51 | w = csv.DictWriter(f, list(set(list(rows[0].keys()) + ['Type'])))
52 | w.writeheader()
53 | w.writerows(rows)
54 |
--------------------------------------------------------------------------------
/MRIPNOAA-selenium.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import requests
4 | from bs4 import BeautifulSoup
5 | from selenium import webdriver
6 | from selenium.webdriver.common.by import By
7 | from selenium.webdriver.support.ui import WebDriverWait
8 | from selenium.webdriver.support import expected_conditions as EC
9 | from urllib.request import Request, urlopen
10 | import time
11 |
12 |
13 | driver = webdriver.Chrome() # needs chromedriver in PATH
14 |
15 | # iframed into
16 | # http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project
17 | driver.get("https://www.st.nmfs.noaa.gov/pims/#view=public_page&program_id=1")
18 |
19 | time.sleep(15)
20 |
21 | for i in range(11):
22 |
23 | projects = []
24 |
25 |     for pane in driver.find_elements_by_class_name("dijitTitlePaneTextNode"):
26 |         os.mkdir("MRIP/" + pane.text)
27 |         projects.append(pane.text)
28 |
29 | content_pane = driver.find_elements_by_class_name("dijitContentPane")[0]
30 | links = content_pane.find_elements_by_class_name("docLink")
31 | if len(links) > 0:
32 | project_ct = -1
33 | for l in links:
34 | if l.text == "Proposal": # begins each new project
35 | project_ct += 1
36 | with open("MRIP/" + projects[project_ct] + "/" + "source.html", 'w') as f:
37 | f.write(str(driver.page_source))
38 |
39 | res = urlopen(Request(l.get_attribute("href")))
40 | with open("MRIP/" + projects[project_ct] + "/" + l.text + ".pdf", 'wb') as pdf:
41 | pdf.write(res.read())
42 |
43 | time.sleep(1)
44 |
45 | driver.find_element_by_id("dijit_form_Button_4_label").click()
46 | time.sleep(1)
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/Kiva-API.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import time
4 |
5 | status = ["funded", "expired"]
6 |
7 | all_loans = []
8 |
9 | for s in status:
10 | for i in range(1, 2): # change range to 10000
11 | # set base url
12 | base_url = "http://api.kivaws.org/v1/loans/search"
13 |
14 | # set response format
15 | response_format = ".json"
16 |
17 | # set search parameters
18 | search_params = {"status": s,
19 | "sort_by": "newest",
20 | "page": i}
21 |
22 | # make request
23 | r = requests.get(base_url + response_format, params=search_params)
24 | time.sleep(1.1)
25 | response_text = r.text
26 |
27 | # Convert JSON response to a dictionary
28 | data = json.loads(response_text)
29 |
30 | last_date = data["loans"][-1]["posted_date"]
31 |
32 |         if last_date.startswith("2016"):
33 | for l in data["loans"]:
34 | l_id = str(l["id"])
35 |
36 | # set base url
37 | base_url = "http://api.kivaws.org/v1/loans/"
38 |
39 | # set response format
40 | response_format = ".json"
41 |
42 | # make request
43 | r = requests.get(base_url + l_id + response_format)
44 | time.sleep(1.1)
45 | response_text = r.text
46 |
47 | # Convert JSON response to a dictionary
48 | detailed_data = json.loads(response_text)
49 | final_data = detailed_data["loans"][0]
50 |
51 | r = requests.get(base_url + l_id + "/teams" + response_format)
52 | time.sleep(1.1)
53 | response_text = r.text
54 | team_data = json.loads(response_text)
55 | final_data["team_count"] = len(team_data["teams"])
56 |
57 | all_loans.append(final_data)
58 |
59 | else:
60 | break
61 |
62 | json.dump(all_loans, open("kiva_data.json", "w"))
63 |
--------------------------------------------------------------------------------
/BGG-BS.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import urllib.parse
3 | from bs4 import BeautifulSoup
4 | import requests
5 | import json
6 | import time
7 |
8 | df = pd.read_csv('game_data.csv')
9 | game_names = set([x.replace(' Rules', '') for x in df['Title']])
10 | print(len(game_names))
11 |
12 | all_dicts = []
13 | for g in game_names:
14 | game = {'Title': g}
15 |
16 | enc = urllib.parse.quote_plus(g)
17 | search_url = 'https://boardgamegeek.com/geeksearch.php?action=search&objecttype=boardgame&q={}&B1=Go'.format(
18 | enc)
19 |
20 | print(search_url)
21 |
22 | res = requests.get(search_url).text
23 | soup = BeautifulSoup(res, 'html5lib')
24 |
25 | first_result = soup.find('tr', {'id': 'row_'})
26 |
27 | try:
28 | metadata = [
29 | x.text.strip().replace(
30 | '\n',
31 | ' ').replace(
32 | '\t',
33 | '').replace(
34 |                 '  ',
35 | ' ') for x in first_result.find_all('td')]
36 | game['rank'], game['name'], game['geek_rating'], game[
37 | 'avg_rating'], game['voters'] = [metadata[0]] + metadata[2:-1]
38 | sub_url = 'https://boardgamegeek.com' + \
39 | first_result.find_all('td')[2].find('a')['href']
40 |
41 | for l in requests.get(sub_url).text.split('\n'):
42 | if l.strip().startswith('GEEK.geekitemPreload'):
43 | data = json.loads(l.strip()[23:-1])
44 | game = {**game, **data['item']['stats']}
45 |
46 | all_dicts.append(game)
47 | json.dump(all_dicts, open('all_dicts.json', 'w'))
48 | time.sleep(1)
49 |
50 | except:
51 | all_dicts.append(game)
52 | json.dump(all_dicts, open('all_dicts.json', 'w'))
53 | time.sleep(1)
54 |
55 | df2 = pd.DataFrame(all_dicts)
56 |
57 | match = []
58 | for t in df2['Title']:
59 | for o in df['Title']:
60 | if o.startswith(t):
61 | match.append(o)
62 | break
63 |
64 | df2['Title'] = match
65 | df.merge(df2, on=('Title')).to_csv('game_data_with_bgg.csv', index=False)
66 |
--------------------------------------------------------------------------------
/BoardGameCapital-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 | import requests
10 |
11 |
12 | driver = webdriver.Chrome()
13 | next_page = "http://www.boardgamecapital.com/board-game-rules.htm"
14 | driver.get(next_page)
15 |
16 | soup = BeautifulSoup(driver.page_source, 'html5lib')
17 | game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1]
18 |
19 | game_dict = {}
20 |
21 | for g in game_cells:
22 | game_dict[g.text] = {}
23 | game_dict[g.text]['link'] = 'http://www.boardgamecapital.com/' + \
24 | g.find('a')['href']
25 |
26 | for k in game_dict.keys():
27 | print(k)
28 | driver.get(game_dict[k]['link'])
29 |
30 | soup = BeautifulSoup(driver.page_source, 'html5lib')
31 |
32 | gstats1 = [x.split(':') for x in soup.find(
33 | 'div', {'class': 'gstats1'}).text.split('\n')]
34 | price = gstats1[0][1].strip()[1:]
35 |     play_time = gstats1[1][1].strip()
36 |
37 | gstats2 = [x.split(':') for x in soup.find(
38 | 'div', {'class': 'gstats2'}).text.split('\n')]
39 | age = gstats2[0][1].strip()
40 | players = gstats2[1][1].strip()
41 |
42 |     text = soup.find('div', {'class': 'mainbody'}).text
43 |
44 | pdf_links = [
45 | a for a in soup.find(
46 | 'div', {
47 |                 'class': 'mainbody'}).find_all('a') if 'Game Rules' in a.text]
48 |
49 | paths = []
50 | for url in pdf_links:
51 | path = 'pdfs/{}.pdf'.format(url.text)
52 | with open(path, 'wb') as f:
53 | f.write(requests.get(url['href']).content)
54 |
55 | paths.append(path)
56 |
57 | paths = ';'.join(paths)
58 |
59 | game_dict[k]['price'] = price
60 |     game_dict[k]['time'] = play_time
61 | game_dict[k]['age'] = age
62 | game_dict[k]['players'] = players
63 | game_dict[k]['paths'] = paths
64 | game_dict[k]['web_text'] = text
65 |
66 |     time.sleep(1)
67 |
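68 | # game_dict is only held in memory above; to persist it, something along
69 | # these lines could be used (pandas is already imported as pd):
70 | # pd.DataFrame.from_dict(game_dict, orient='index').to_csv('board_game_rules.csv')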
--------------------------------------------------------------------------------
/IMSDB-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup, NavigableString, Tag
3 | import time
4 | import urllib
5 | import pickle
6 |
7 | res = requests.get('http://www.imsdb.com/all%20scripts/').text
8 |
9 | soup = BeautifulSoup(res, 'html5lib')
10 |
11 | movies = soup.find_all('td', {'valign': 'top'})[2].find_all('p')
12 |
13 | base_url = 'http://www.imsdb.com'
14 | movie_urls = [
15 | base_url +
16 | urllib.parse.quote(
17 | m.find('a')['href']) for m in movies]
18 |
19 | all_meta = []
20 | # all_meta = pickle.load(open('meta_dicts.pkl', 'rb'))
21 | for i, url in enumerate(movie_urls[:3]):
22 | print(i)
23 | res = requests.get(url).text
24 | soup = BeautifulSoup(res, 'html5lib')
25 |
26 | script_details = soup.find('table', {'class': 'script-details'})
27 |
28 | title = script_details.find('h1').text.strip()
29 |
30 | split_details = script_details.find_all('td')[2]
31 |
32 | meta_data = {'title': title}
33 | for t in split_details.find_all('b'):
34 |
35 | sibling_data = ''
36 | for s in t.next_siblings:
37 | if isinstance(s, NavigableString):
38 | if len(str(s).strip()) > 1:
39 | sibling_data += str(s).strip()
40 | break
41 | elif isinstance(s, Tag):
42 | try:
43 | if s.name == 'a':
44 | sibling_data += s.text + ';'
45 | except:
46 | pass
47 |
48 | if s.name == 'b':
49 | break
50 |
51 | meta_data[t.text] = sibling_data
52 |
53 | all_meta.append(meta_data)
54 |
55 | if "Read" in script_details.find_all('a')[-1].text:
56 |
57 | script_link = base_url + \
58 | urllib.parse.quote(script_details.find_all('a')[-1]['href'])
59 |
60 | script_path = "scripts/" + title + '.html'
61 | with open(script_path, 'w') as f:
62 | f.write(requests.get(script_link).text)
63 |
64 | else:
65 | script_path = "NA"
66 |
67 | meta_data['script_path'] = script_path
68 |
69 | pickle.dump(all_meta, open('meta_dicts.pkl', 'wb'))
70 |
71 | time.sleep(1)
72 |
--------------------------------------------------------------------------------
/NHSTrustsInfo-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | import csv
5 |
6 |
7 | trust_url = 'https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx'
8 | res = requests.get(trust_url)
9 | soup = BeautifulSoup(res.text, 'lxml')
10 |
11 | all_trusts = [x for x in soup('a') if x['href'].startswith('/Services/Trusts/Overview/DefaultView.aspx?id=')]
12 |
13 | all_items = []
14 | for t in all_trusts:
15 | trust_name = t.text
16 | print(trust_name)
17 | trust_site = 'https://www.nhs.uk' + t['href'].replace('Overview', 'HospitalsAndClinics')
18 | res = requests.get(trust_site)
19 | soup = BeautifulSoup(res.text, 'lxml')
20 | items = [x for x in soup.find_all('div', {'class': 'panel-content'}) if 'Address' in str(x)]
21 |     for i in items:
22 |         item_name = i.find('h3')
23 |         if item_name:
24 |             item_name = item_name.text
25 |         else:
26 |             continue
27 | 
28 |         if not i.find('a'):
29 |             continue
30 | 
31 |         if i.find('a')['href'].startswith('/Services'):
32 |             url = 'https://www.nhs.uk' + i.find('a')['href']
33 |             service_type = i.find('a')['href'].split('/')[2].title()
34 |         else:
35 |             url = i.find('a')['href']
36 |             service_type = 'Other'
37 | 
38 |         properties = [x.text for x in i.find('dl').find_all('dt')]
39 |         values = [BeautifulSoup(str(x).replace('<br/>', ', '), 'lxml').text for x in i.find('dl').find_all('dd')]
40 | 
41 |         info_dict = {'Name': item_name,
42 |                      'URL': url,
43 |                      'Type': service_type,
44 |                      'Trust Name': trust_name}
45 |         for idx, k in enumerate(properties):
46 |             if k in ['PostCode', 'Ext', 'Website']:
47 |                 continue
48 |             info_dict[k.strip(':')] = values[idx]
49 | 
50 |         all_items.append(info_dict)
51 |
52 | time.sleep(2)
53 |
54 |
55 | keys = ['Name', 'Trust Name', 'Type', 'Tel', 'Address', 'Email', 'URL']
56 | with open('nhs_sites.csv', 'w', newline='') as output_file:
57 | dict_writer = csv.DictWriter(output_file, keys)
58 | dict_writer.writeheader()
59 | dict_writer.writerows(all_items)
60 |
--------------------------------------------------------------------------------
/INOCAR-AJAX.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import re
5 | import csv
6 | import time
7 | from random import randint
8 | import pickle
9 | import os.path
10 |
11 | id_dict = {"61": "San Lorenzo", "377": "Esmeraldes"}
12 | days = [str(x) for x in list(range(1, 32))]
13 | months = [str(x) for x in list(range(1, 13))]
14 | years = [str(x) for x in list(range(2003, 2016))]
15 |
16 | if os.path.isfile("already_scraped.pkl"):
17 | already_scraped = pickle.load(open("already_scraped.pkl", "rb"))
18 | else:
19 | already_scraped = []
20 |
21 | for l in id_dict.keys():
22 | for y in years:
23 | for m in months:
24 | for d in days:
25 | date = d + "/" + m + "/" + y
26 | if (l, date) not in already_scraped:
27 | payload = {
28 | "id_puerto": l,
29 | "dias": d,
30 | "mes": m,
31 | "anio": y,
32 | "task": "generate",
33 | "tipocon": "form_",
34 | "Submit": "Ver",
35 | }
36 |
37 | r = requests.post(
38 | url='http://www.inocar.mil.ec/mareas/consulta.php',
39 | data=payload
40 | )
41 |
42 | soup = BeautifulSoup(r.text, "lxml")
43 |
44 | r1 = soup.findAll("tr", {"class": "row_1"})[2:4]
45 | r2 = soup.findAll("tr", {"class": "row_2"})[2:4]
46 | rows = [tuple(r1[0].get_text().split('\n')),
47 | tuple(r2[0].get_text().split('\n')),
48 | tuple(r1[1].get_text().split('\n')),
49 | tuple(r2[1].get_text().split('\n'))]
50 |
51 | with open('data.csv', 'a') as f:
52 | a = csv.writer(f)
53 | for r in rows:
54 | row = (id_dict[l], date) + r
55 | a.writerow(row)
56 |
57 | already_scraped.append((l, date))
58 | pickle.dump(
59 | already_scraped, open(
60 | "already_scraped.pkl", "wb"))
61 | time.sleep(randint(1, 3))
62 |
--------------------------------------------------------------------------------
/GoogleGeoLatLong-API.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.request import Request, urlopen
3 | import time
4 | import csv
5 |
6 |
7 | def getJson(lat, longi):
8 | url = 'http://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s&sensor=true' % \
9 | (lat, longi)
10 |
11 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
12 |
13 | response = urlopen(req).read().decode('utf-8')
14 | responseJson = json.loads(response)['results']
15 |
16 | return responseJson
17 |
18 | latlong = [(18.6, -100.566667),
19 |            (19.6, -100.566667),
20 |            (19.6, -101.566667),
21 |            (17.6, -100.566667),
22 |            (27.121381, -107.200644),
23 |            (37.586630, -123.233372),
24 |            (25.267348, -120.087235),
25 |            (19.6, -96.566667),
26 |            (17.6, -98.566667),
27 |            (37.882042, -122.277562)]
28 | 
29 |
30 | municps = []
31 | for coord in latlong:
32 | switch = 0
33 | info = getJson(coord[0], coord[1])
34 | # municps.append(info.get("results")[1].get("address_components")[0].get("long_name"))
35 | # #if certain data is there
36 | for result in info: # to avoid errors if incorrect data
37 | for address_component in result['address_components']:
38 | if address_component['types'] == [
39 | "administrative_area_level_2", "political"]:
40 | municps.append(address_component['long_name'])
41 | switch = 1
42 | break
43 | break
44 |
45 | if switch == 1:
46 | continue
47 | else:
48 | municps.append("None")
49 |
50 | time.sleep(.11)
51 |
52 |
53 | latlongname = list(zip(latlong, municps))
54 |
55 | with open('data.csv', 'w') as out:
56 | csv_out = csv.writer(out)
57 | csv_out.writerow(['lat-long', 'name'])
58 | for row in latlongname:
59 | csv_out.writerow(row)
60 |
--------------------------------------------------------------------------------
/LARRP-BS.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup # to parse HTML
2 | import csv # to write CSV
3 | import pandas as pd # to see CSV
4 | import time
5 | import os
6 | import random
7 | import requests
8 |
9 |
10 | def dl_pages(base_url, books, pres):
11 | os.mkdir(pres)
12 | for i1, b in enumerate(books):
13 | next_page = b
14 | res = requests.get(next_page).text
15 | soup = BeautifulSoup(res, 'html5lib')
16 | book_title = soup.find('h3').text
17 |
18 | os.mkdir(pres + '/' + book_title + '-' + str(i1))
19 |
20 | try:
21 | for i in range(1, 10000):
22 | res = requests.get(next_page).text
23 |
24 | soup = BeautifulSoup(res, 'html5lib')
25 |
26 | if 'Discurso al proclamarse su candidatura' in book_title:
27 | next_page = base_url + \
28 | soup.find('center').find_all('a')[1]['href']
29 | else:
30 | next_page = base_url + \
31 | soup.find('center').find_all('a')[2]['href']
32 |
33 | tif_link = base_url + \
34 | [x['href'] for x in soup.find_all('a') if 'tif' in x['href']][0]
35 |
36 | res = requests.get(tif_link).content
37 |
38 | with open(pres + '/' + book_title + '-' + str(i1) + '/page-' + str(i) + '.tif', 'wb') as f:
39 | f.write(res)
40 |
41 | time.sleep(1)
42 | except:
43 | continue
44 |
45 |
46 | books = [
47 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/180002t.html',
48 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/190117t.html',
49 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/200253t.html',
50 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/210286t.html',
51 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/170347.html']
52 |
53 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/'
54 |
55 | dl_pages(base_url, books, 'yrigoyen')
56 |
57 |
58 | res = requests.get(
59 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/index.html').text
60 |
61 | soup = BeautifulSoup(res, 'html5lib')
62 |
63 | books = []
64 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/'
65 | for li in soup.find('ul').find_all('li'):
66 | link = [x for x in li.find_all('a') if 'idx' not in x['href']][0]
67 |
68 | if not link.text.strip().startswith('I'):
69 | books.append(base_url + link['href'])
70 |
71 |
72 | dl_pages(base_url, books, 'peron')
73 |
--------------------------------------------------------------------------------
/Wiktionary-API.py:
--------------------------------------------------------------------------------
1 | '''This script scrapes wiktionary to get MHG lemmas of NHG lemmas.'''
2 |
3 | from bs4 import BeautifulSoup
4 | from urllib.request import Request, urlopen
5 | import time
6 | from string import punctuation
7 | import urllib.parse
8 | import treetaggerwrapper
9 | import json
10 | import time
11 | from random import randint
12 | import os
13 | import pyprind
14 |
15 |
16 | # get words from freq list and translations
17 | with open("top10000.txt", "r") as f:
18 | words = f.read().split()
19 |
20 | with open("NHG.txt", "r") as f:
21 | more_words = f.read().split()
22 |
23 | all_words = set(words + more_words)
24 |
25 | # turn words to set of lemmas
26 | tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
27 |
28 | lemmas = []
29 | for w in all_words:
30 | lemm = tagger.tag_text(w)[0].split("\t")[-1]
31 | lemmas.append(lemm)
32 |
33 | lemmas = set(lemmas)
34 |
35 | # start scraping here
36 | base = "https://de.wiktionary.org/w/api.php?format=xml&action=query&titles="
37 | branch = "&rvprop=content&prop=revisions&redirects=1"
38 |
39 | if os.path.isfile("cognate_dict.json"):
40 | cognate_dict = json.load(open("cognate_dict.json", "r"))
41 | else:
42 | cognate_dict = {}
43 |
44 | bar = pyprind.ProgBar(len(lemmas), monitor=True, bar_char="#")
45 | for w in lemmas:
46 |
47 | if w not in cognate_dict:
48 |
49 | # for UTF-8 URL parsing
50 | url = base + w + branch
51 | url_word = urllib.parse.quote(w)
52 | url = base + url_word + branch
53 |
54 | html = urlopen(url)
55 | bsObj = BeautifulSoup(html.read(), "lxml")
56 | text = bsObj.get_text()
57 |
58 | if "mittelhochdeutsch" in text:
59 | ind = text.index("mittelhochdeutsch")
60 | cognates = text[ind:].split("''")
61 |
62 | if len(cognates) > 1:
63 | cognates = cognates[1].split()
64 | for i, c in enumerate(cognates):
65 | if "|" in c:
66 | cognates[i] = c.split("|")[-1]
67 |
68 | for char in punctuation:
69 | cognates = [c.replace(char, "") for c in cognates]
70 |
71 | cognates = [c for c in cognates if len(c) > 0 and c[
72 | 0].isalpha()]
73 |
74 | cognate_dict[w] = cognates
75 |
76 | with open("cognate_dict.json", "w") as f:
77 | json.dump(cognate_dict, f)
78 |
79 | time.sleep(randint(1, 3))
80 |
81 | else:
82 | cognate_dict[w] = None
83 |
84 | with open("cognate_dict.json", "w") as f:
85 | json.dump(cognate_dict, f)
86 |
87 |     else:
88 |         # this lemma was already looked up on a previous run; keep the
89 |         # stored result rather than overwriting it with None
90 |         pass
91 | 
92 | 
93 |
94 | bar.update()
95 |
96 | print("Done!")
97 |
--------------------------------------------------------------------------------
/ADA-ERP-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import re
4 | import time
5 | import pickle
6 | import csv
7 |
8 |
9 | def get_pages(soup):
10 | '''
11 | gets links to any subsequent pages
12 | '''
13 | base = 'https://professional.diabetes.org'
14 | try:
15 | page_links = soup.find('ul', {'class': 'pagination'}).find_all('a')
16 | links = [base + a['href'] for a in page_links]
17 | return set(links)
18 | except:
19 | return None
20 |
21 |
22 | def get_org_dicts(soup):
23 | '''
24 | turn any listed organizations on page to dictionaries
25 | '''
26 |
27 | orgs = soup.find_all('div', {'class': 'col col-sm-4'})
28 |
29 | org_dicts = []
30 |
31 | for o in orgs:
32 | meta = o.find_all('div')
33 | org_dict = {}
34 |
35 | # up to colon is key after is value
36 | pattern = re.compile('(.*?):(.*)')
37 | for m in meta:
38 | try:
39 | groups = re.search(pattern, m.text).groups()
40 | title = groups[0].strip()
41 | value = groups[1].strip()
42 | org_dict[title] = value
43 | except:
44 | pass
45 |
46 | org_dicts.append(org_dict)
47 |
48 | return org_dicts
49 |
50 |
51 | if __name__ == "__main__":
52 | # get list of states from sample URL
53 | init = 'https://professional.diabetes.org/erp_list?field_erp_state_value=NY'
54 | res = requests.get(init)
55 | soup = BeautifulSoup(res.text, 'html5lib')
56 | options = soup.find(
57 | 'select', {'id': 'edit-field-erp-state-value'}).find_all('option')
58 | states = [x['value'] for x in options]
59 |
60 | # start iteration through state URLS
61 | all_dicts = []
62 | for s in states:
63 | print(s)
64 | state_link = 'https://professional.diabetes.org/erp_list?field_erp_state_value={}'.format(
65 | s)
66 | res = requests.get(state_link)
67 | soup = BeautifulSoup(res.text, 'html5lib')
68 |
69 | # get dicts
70 | all_dicts.extend(get_org_dicts(soup))
71 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb'))
72 |
73 | # get extra pages
74 | pages = get_pages(soup)
75 |
76 | # cycle through subsequent pages
77 |         if pages is not None:
78 | for p in pages:
79 | res = requests.get(p)
80 | soup = BeautifulSoup(res.text, 'html5lib')
81 | all_dicts.extend(get_org_dicts(soup))
82 | time.sleep(1)
83 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb'))
84 | time.sleep(1)
85 |
86 | # dump csv
87 | with open('erp.csv', 'w') as csvfile:
88 | fieldnames = list(all_dicts[0].keys())
89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
90 | writer.writeheader()
91 | writer.writerows(all_dicts)
92 |
--------------------------------------------------------------------------------
/RioGrandeGames-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 | import requests
10 | import pickle
11 |
12 |
13 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
14 | driver.get('http://riograndegames.com/search.html?category%5B%5D=5&category%5B%5D=10&category%5B%5D=14&category%5B%5D=1&category%5B%5D=2&category%5B%5D=12&category%5B%5D=3&category%5B%5D=6&category%5B%5D=8&category%5B%5D=9&category%5B%5D=4&category%5B%5D=13&category%5B%5D=22&category%5B%5D=16&category%5B%5D=11&category%5B%5D=7&category%5B%5D=17&category%5B%5D=18&category%5B%5D=15&language=0&min_players=0&length=0&min_age=0&term=')
15 | search_results = driver.find_element_by_css_selector(
16 | 'div#search_results.isotope').find_elements_by_css_selector('div.search_item.isotope-item')
17 |
18 | games_dicts = []
19 | attributes = [
20 | 'data-title',
21 | 'data-orig',
22 | 'data-length',
23 | 'data-date',
24 | 'data-age',
25 | 'data-players',
26 | 'data-msrp']
27 |
28 | for s in search_results:
29 | game = {}
30 | for a in attributes:
31 | game[a] = s.get_attribute(a)
32 |
33 | game['page_link'] = s.find_element_by_css_selector(
34 | 'a').get_attribute('href')
35 |
36 | games_dicts.append(game)
37 |
38 |
39 | final_games_dicts = []
40 | for g in games_dicts:
41 | print(g['data-title'])
42 | driver.get(g['page_link'])
43 | cats = driver.find_elements_by_css_selector('span.game_cat')
44 | cats = [c.text.replace(',', '') for c in cats]
45 | g['game_category'] = ';'.join(cats)
46 |
47 | # unfold and download
48 | driver.find_element_by_css_selector('span.button2').click()
49 |
50 | asset_links = driver.find_elements_by_css_selector('p.asset_list a')
51 |
52 | for a in asset_links:
53 | images = a.find_elements_by_css_selector("img")
54 | for i in images:
55 | if "rules" in i.get_attribute('title').lower():
56 | download = a.get_attribute('href')
57 | session = requests.Session()
58 | cookies = driver.get_cookies()
59 |
60 | for cookie in cookies:
61 | session.cookies.set(cookie['name'], cookie['value'])
62 | response = session.get(download)
63 |
64 | dl_path = 'pdfs/' + g['data-title'] + '.pdf'
65 |
66 | with open(dl_path, 'wb') as f:
67 | f.write(response.content)
68 |
69 | g['pdf_path'] = dl_path
70 | final_games_dicts.append(g)
71 | pickle.dump(final_games_dicts, open('game_dicts.pkl', 'wb'))
72 |
73 | time.sleep(1)
74 | break
75 | break
76 |
77 | time.sleep(1)
78 |
--------------------------------------------------------------------------------
/STNMFSNOAA-BS.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import os
4 | import time
5 |
6 | # send payload to get list of species
7 | payload = {'qwhocalled': 'monthly',
8 | 'qcommon': '',
9 | 'qreturn': 'Search',
10 | 'qselect': 'List Empty, Do a Search to Fill'}
11 | r = requests.get(
12 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES',
13 | params=payload)
14 |
15 | soup = BeautifulSoup(r.content, "lxml")
16 | species = [x.text for x in soup.findAll("option")]
17 |
18 | # iterate through species
19 | for sp in species:
20 |
21 | if not os.path.exists(sp.replace(",", "").replace(
22 | " ", "-").replace("/", "_")): # if need to restart script
23 |
24 | # make directory for species
25 | os.mkdir(sp.replace(",", "").replace(" ", "-").replace("/", "_"))
26 |
27 | # send payload to get different states and regions
28 | payload = {'qwhocalled': 'monthly',
29 | 'qcommon': '',
30 | 'qreturn': 'Return',
31 | 'qselect': sp}
32 | r = requests.get(
33 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES',
34 | params=payload)
35 |
36 | soup = BeautifulSoup(r.content, "lxml")
37 | states = [
38 | x.text for x in soup.find(
39 | "select", {
40 | "name": "qstate"}).findAll("option")]
41 |
42 | # iterate through different regions and states
43 | for st in states:
44 |
45 | payload = {'qspecies': sp,
46 | 'qreturn': 'Species Locator',
47 | 'qyearfrom': '1990',
48 | 'qyearto': '2015',
49 | 'qmonth': 'YEAR BY MONTH',
50 | 'qstate': st,
51 | 'qoutput_type': 'TABLE'}
52 | r = requests.get(
53 | 'http://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS',
54 | params=payload)
55 |
56 | # save html tables into folders
57 | with open(sp.replace(",", "").replace(" ", "-").replace("/", "_") + "/" + st + ".html", "w") as f:
58 |                 f.write(r.text)
59 |
60 | # don't overload server
61 | time.sleep(.1)
62 |
63 | # also fetch the "ALL SPECIES COMBINED" aggregate for each state/region
64 | os.mkdir('ALL-SPECIES-COMBINED')
65 |
66 | # iterate through different states and regions
67 | for st in states:
68 |
69 | payload = {'qspecies': 'ALL SPECIES COMBINED',
70 | 'qreturn': 'Species Locator',
71 | 'qyearfrom': '1990',
72 | 'qyearto': '2015',
73 | 'qmonth': 'YEAR BY MONTH',
74 | 'qstate': st,
75 | 'qoutput_type': 'TABLE'}
76 |
77 | r = requests.get(
78 | 'https://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS',
79 | params=payload)
80 |
81 | with open('ALL-SPECIES-COMBINED' + "/" + st + ".html", "w") as f:
82 |         f.write(r.text)
83 |
--------------------------------------------------------------------------------
/BAAD-BS.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup # to parse HTML
2 | import csv # to write CSV
3 | import pandas as pd # to see CSV
4 | import time
5 | import os
6 | import random
7 | import requests
8 |
9 | next_page = 'http://www.start.umd.edu/baad/database'
10 | base_url = 'http://www.start.umd.edu'
11 |
12 | all_rows = []
13 | all_rows.append(['ID',
14 | 'Group Name',
15 | 'Country',
16 | 'Lethality',
17 | 'Number of Allies',
18 | 'Number of Rivals',
19 | 'Founded',
20 | 'Fatalities',
21 | 'Fatality Years',
22 | 'Ideologies',
23 | 'Strength',
24 | 'Territorial Control',
25 | 'Funding through Drug Trafficking',
26 | 'Record Year'])
27 |
28 | for i in range(1, 6):
29 | res = requests.get(next_page).text
30 |
31 | soup = BeautifulSoup(res, 'html5lib')
32 |
33 |     rows = soup.find('table', {'class': 'sticky-enabled'}).find_all('tr')
34 | rows = rows[1:]
35 |
36 | for r in rows:
37 | cells = r.find_all('td')
38 | cell_text = [x.text.strip() for x in cells]
39 | link = base_url + cells[0].find('a')['href']
40 |
41 | res = requests.get(link).text
42 | soup = BeautifulSoup(res, 'html5lib')
43 |
44 | year_bullets = soup.find('div', {'class': 'item-list'}).find_all('li')
45 | year_urls = [(base_url + x.find('a')['href'],
46 | x.find('a').text.strip()) for x in year_bullets]
47 | for u in year_urls:
48 | record_year = u[1]
49 | res = requests.get(u[0]).text
50 | soup = BeautifulSoup(res, 'html5lib')
51 |
52 |             founded = soup.find(
53 |                 'div', {'class': 'quick-view-founded'}).text.split(':')[-1].strip()
54 |             fatalities, fatality_years = soup.find(
55 |                 'div', {'class': 'quick-view-lethality'}).text.split(':')[-1].strip().split(' ', maxsplit=1)
56 |             ideology = soup.find(
57 |                 'div', {'class': 'quick-view-ideology'}).text.split(':')[-1].strip()
58 |             strength = soup.find(
59 |                 'div', {'class': 'quick-view-strength'}).text.split(':')[-1].strip()
60 |             terrcnt = soup.find(
61 |                 'div', {'class': 'quick-view-terrcnt'}).text.split(':')[-1].strip()
62 |             drugs = soup.find(
63 |                 'div', {'class': 'quick-view-drug-funding'}).text.split(':')[-1].strip()
64 |
65 | data_row = [
66 | cell_text[0] + '-' + record_year] + cell_text + [
67 | founded,
68 | fatalities,
69 | fatality_years,
70 | ideology,
71 | strength,
72 | terrcnt,
73 | drugs,
74 | record_year]
75 | print(data_row)
76 | all_rows.append(data_row)
77 |
78 | time.sleep(1)
79 |
80 | time.sleep(1)
81 |
82 | next_page = 'http://www.start.umd.edu/baad/database?page={}'.format(str(i))
83 | time.sleep(1)
84 |
85 |
86 | with open("baad.csv", "w") as f:
87 | csv_w = csv.writer(f)
88 | csv_w.writerows(all_rows)
89 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # web-scrapers
2 |
3 | Various web scrapers for research and fun. File suffixes indicate the main tool or access method used (e.g. `-BS` for BeautifulSoup, `-selenium` for browser automation, `-dry` for dryscrape, `-wget` for shell downloads, `-API` for direct API calls). Scraped sources:
4 |
5 | - [Board Game Capital](http://www.boardgamecapital.com/board-game-rules.htm)
6 | - [CTS Net](https://www.ctsnet.org/)
7 | - [Minutes of the Federal Reserve Board of Governors discount rate](https://www.federalreserve.gov/monetarypolicy/discountrate.htm)
8 | - [Doximity](https://www.doximity.com/)
9 | - [Energy - The Automated Register of Implemented Actions](https://www.energy.gov/eere/downloads/automated-register-implemented-actions)
10 | - [Lucid Chart](https://www.lucidchart.com/)
11 | - [GLERL NOAA](https://www.glerl.noaa.gov//metdata/status/status_archive/)
12 | - [American Historical Association](http://careers.historians.org/jobs/)
13 | - [Grimm Fairy Tales](https://www.cs.cmu.edu/~spok/grimmtmp/)
14 | - [Perrault Fairy Tales](http://www.pitt.edu/~dash/perrault.html)
15 | - [IMSDB](http://www.imsdb.com/all%20scripts/)
16 | - [Glass Door API](https://www.glassdoor.com/index.htm)
17 | - [Crunch Base API](https://data.crunchbase.com/docs)
18 | - [Google Directions](https://www.google.com/maps/dir/)
19 | - [INOCAR](http://www.inocar.mil.ec/web/index.php)
20 | - [Kalamazoo](http://scholarworks.wmich.edu/)
21 | - [Kiva API](https://www.kiva.org/)
22 | - [Google Geocoding API](https://developers.google.com/maps/documentation/geocoding/start)
23 | - [Google Search](https://www.google.com/)
24 | - [GLO Records](https://glorecords.blm.gov/default.aspx)
25 | - [MRIP NOAA](http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project)
26 | - [Web of Science](http://ipscience-help.thomsonreuters.com/LAMRService/WebServiceOperationsGroup/requestAPIWoS.html)
27 | - [ST NMFS NOAA](https://www.st.nmfs.noaa.gov/)
28 | - [NCDC NOAA](https://www.ncdc.noaa.gov/cdr/atmospheric/total-solar-irradiance)
29 | - [Open Secrets](https://www.opensecrets.org/resources/create/apis.php)
30 | - [Resident Advisor](https://www.residentadvisor.net/reviews.aspx?format=single)
31 | - [Rate My Professors](http://www.ratemyprofessors.com/)
32 | - [LARRP](http://lanic.utexas.edu/larrp/pm/sample2/)
33 | - [Wiktionary](https://de.wiktionary.org/)
34 | - [Wikipedia Revision History API](https://www.mediawiki.org/wiki/API:Revisions)
35 | - [Big, Allied and Dangerous (BAAD)](http://www.start.umd.edu/baad/database)
36 | - [Rio Grande Games](http://riograndegames.com/)
37 | - [Bayerische Staatsbibliothek](https://opacplus.bsb-muenchen.de/)
38 | - [DataSF Fire Incidents API](https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric)
39 | - [Google Geocoding API Searches](https://developers.google.com/maps/documentation/geocoding/start)
40 | - [Board Game Geek](https://boardgamegeek.com/)
41 | - [Data Mart Basic Skills](http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx)
42 | - [Public Access to Court Electronic Records (PACER)](https://www.pacer.gov/)
43 | - [SF Planning Commission Minutes](http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html)
44 | - [American Diabetes Association ERP Resources](https://professional.diabetes.org/erp_list?field_erp_state_value=NY)
45 | - [National Stolen Art File](https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file)
46 | - [NHS Trusts](https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx)
47 |
--------------------------------------------------------------------------------
/BSBDigitaleSammlungen-API.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 | import requests
10 | import re
11 | import pickle
12 | import numpy as np
13 |
14 |
15 | # PART 1
16 | # first collect bsb ids from search of years 700-1400
17 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
18 |
19 | driver.maximize_window()
20 |
21 | driver.get("https://opacplus.bsb-muenchen.de/metaopac/start.do")
22 | driver.find_element_by_css_selector(
23 | 'input#searchRestrictionValue1_2.form-control').send_keys('700')
24 | driver.find_element_by_css_selector(
25 | 'input#searchRestrictionValue2_2.form-control').send_keys('1400')
26 | driver.find_element_by_css_selector(
27 | 'input#submitSearch.btn.btn-default.dbuttonb').click()
28 | driver.find_element_by_css_selector(
29 | '#availableFacets > ul > li:nth-child(4) > ul > li:nth-child(5) > a > span.hidden-xs').click()
30 |
31 | time.sleep(5)
32 |
33 | print(driver.find_element_by_css_selector(
34 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.col-xs-9.col-md-5 > h2').text)
35 |
36 | bsbs = []
37 | pattern = r'bsb[0-9]+'
38 |
39 | for i in range(2000):  # page through up to 2000 result pages, harvesting bsb ids from each
40 |
41 | print(i)
42 |
43 | soup = BeautifulSoup(driver.page_source, 'html5lib')
44 |
45 | rows = soup.find_all('td', {'class': 'resultscell'})
46 |
47 | for r in rows:
48 | links = r.find_all('a')
49 | for l in links:
50 | if re.search(pattern, l['href']):
51 | bsbs.append(re.search(pattern, l['href']).group())
52 |
53 | pickle.dump(bsbs, open('bsbs.pkl', 'wb'))
54 |
55 | driver.find_element_by_css_selector(
56 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.hidden-xs.hidden-sm.col-xs-7.col-md-7.pull-right.pagination > div > ul > li:nth-child(8) > a').click()
57 | time.sleep(5)
58 |
59 |
60 | # PART 2
61 | # now read in list of bsb ids and collect API data
62 |
63 |
64 | def get_dimensions(res):
65 |
66 | width = []
67 | height = []
68 | for p in res['sequences'][0]['canvases']:
69 | try:
70 | scale = p['service']['physicalScale']
71 | width.append(p['width'] * scale)
72 | height.append(p['height'] * scale)
73 | except:
74 | pass
75 |
76 | return (np.mean(height), np.mean(width))
77 |
78 | bsbs = pickle.load(open('bsbs.pkl', 'rb'))
79 | data_dicts = []
80 |
81 | for bsb in bsbs:
82 | print(bsb)
83 |
84 | try:
85 | res = requests.get(
86 | 'https://api.digitale-sammlungen.de/iiif/presentation/v2/{}/manifest'.format(bsb)).json()
87 | hs_dict = {}
88 | hs_dict['Thumbnail'] = res['thumbnail']['@id']
89 | hs_dict['Label'] = res['label']
90 |
91 | for m in res['metadata']:
92 | key = m['label'][1]['@value']
93 | value = m['value']
94 |
95 | if isinstance(value, list):
96 | value = value[-1]['@value']
97 |
98 | hs_dict[key.strip()] = value.strip()
99 |
100 | hs_dict['Height'], hs_dict['Width'] = get_dimensions(res)
101 |
102 | data_dicts.append(hs_dict)
103 | pickle.dump(data_dicts, open('data_dicts.pkl', 'wb'))
104 |
105 | except:
106 | pass
107 |
108 | time.sleep(3)
109 |
--------------------------------------------------------------------------------
/AHA-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 | import requests
10 |
11 | driver = webdriver.Chrome()
12 | driver.get("http://careers.historians.org/jobs/?page=1")
13 |
14 | base_url = 'http://careers.historians.org'
15 | all_rows = []
16 | pages = ["http://careers.historians.org/jobs/?page=1",
17 | "http://careers.historians.org/jobs/?page=2"]
18 |
19 | for p in pages:
20 | driver.get(p)
21 | soup = BeautifulSoup(driver.page_source, 'html5lib')
22 |
23 | rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'})
24 | for r in rows:
25 | title = r.find('a').text.strip()
26 | link = base_url + r.find('a')['href']
27 | employer = r.find(
28 | 'div', {
29 | 'class': 'bti-ui-job-result-detail-employer'}).text.strip()
30 | location = r.find(
31 | 'div', {
32 | 'class': 'bti-ui-job-result-detail-location'}).text.strip()
33 | date_posted = r.find(
34 | 'div', {
35 | 'class': 'bti-ui-job-result-detail-age'}).text.strip()
36 |
37 | driver.get(link)
38 |
39 | soup = BeautifulSoup(driver.page_source, 'html5lib')
40 |
41 | try:
42 | job_description = soup.find(
43 | 'div', {'class': 'bti-jd-description'}).text.strip()
44 |
45 | details = soup.find('div', {'class': 'bti-jd-details-container'})
46 |
47 | details_titles = [
48 | x.text.replace(
49 | ':', '').lower().strip() for x in details.find_all(
50 | 'div', {
51 | 'class': 'bti-jd-detail-title'})]
52 | details_text = [
53 | x.text.strip() for x in details.find_all(
54 | 'div', {
55 | 'class': 'bti-jd-detail-text'})]
56 |
57 | details_dict = {}
58 |
59 | for i in range(len(details_titles)):
60 | t = details_titles[i]
61 | if 'categories' in t:
62 | t = 'category'
63 | elif 'required' in t:
64 | t = 'preferred education'
65 | details_dict[t] = details_text[i]
66 |
67 | details_dict['title'] = title
68 | details_dict['link'] = link
69 | details_dict['employer'] = employer
70 | details_dict['location'] = location
71 | details_dict['date_posted'] = date_posted
72 | details_dict['job_description'] = job_description
73 |
74 | try:
75 | details_dict['employer_about'] = soup.find(
76 | 'div', {'class': 'bti-jd-employer-info'}).text.strip()
77 | except:
78 | details_dict['employer_about'] = ''
79 |
80 | all_rows.append(details_dict)
81 |
82 | except:
83 | pass
84 |
85 | time.sleep(1)
86 |
87 | header = ["title",
88 | "employer",
89 | "location",
90 | "posted",
91 | "date_posted",
92 | "primary field",
93 | "category",
94 | "preferred education",
95 | "salary",
96 | "type",
97 | "employment type",
98 | "job_description",
99 | "employer_about",
100 | "link"
101 | ]
102 |
103 |
104 | with open('AHA-data.csv', 'w') as f:
105 | w = csv.DictWriter(f, header)
106 | w.writeheader()
107 | w.writerows(all_rows)
108 |
--------------------------------------------------------------------------------
/ResidentAdvisor-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 |
10 | driver = webdriver.PhantomJS()
11 | next_page = "https://www.residentadvisor.net/reviews.aspx?format=single"
12 |
13 | with open("resident-adv.csv", "a") as f:
14 | csv_w_interv = csv.writer(f)
15 | csv_w_interv.writerow(["title",
16 | "artist",
17 | "single",
18 | "label",
19 | "record",
20 | "style",
21 | "reviewed_date",
22 | "release_date",
23 | "comments",
24 | "rating",
25 | "description",
26 | "URL"])
27 |
28 |
29 | for i in range(10000):
30 |
31 | driver.get(next_page)
32 |
33 | soup = BeautifulSoup(driver.page_source, "html5lib")
34 |
35 | try:
36 | next_page = "https://www.residentadvisor.net/" + \
37 | soup.find("li", {"class": "but arrow-left bbox"}).find("a")['href']
38 | except:
39 | next_page = ""
40 |
41 | singles = soup.find(
42 | "div", {
43 | "id": "reviews"}).find_all(
44 | "article", {
45 | "class": "highlight-top"})
46 |
47 | review_links = [
48 | 'https://www.residentadvisor.net' +
49 | x.find("a")['href'] for x in singles]
50 |
51 | if i == 0:
52 | review_links = review_links[25:]
53 |
54 | for l in review_links:
55 | driver.get(l)
56 |
57 | soup = BeautifulSoup(driver.page_source, 'html5lib')
58 |
59 | title = soup.find("div", {"id": "sectionHead"}).find("h1").text.strip()
60 |
61 | try:
62 | artist = title.split("-")[0].strip()
63 |
64 | single = title.split("-")[1].strip()
65 | except:
66 | artist = ''
67 | single = ''
68 |
69 | print(title)
70 |
71 | rating = soup.find("span", {"class": "rating"}).text.split("/")[0]
72 | reviewed_date = soup.find("span", {"itemprop": "dtreviewed"})[
73 | 'datetime'].strip()
74 |
75 | meta_list = soup.find("ul", {"class": "clearfix"}).find_all("li")
76 |
77 | style = meta_list[2].text.split('\n')[4]
78 | # the first metadata cell holds the label link and the record/catalog text
79 | label = meta_list[0].find('a').text.strip()
80 | record = meta_list[0].get_text('\n').split('\n')[-1].strip()
81 | release_date = meta_list[1].text.split('\n')[4]
82 | comments = meta_list[3].text.split('\n')[4].split("/")[0].strip()
83 | description = soup.find("span",
84 | {"itemprop": "description"}).text.strip()
85 |
86 | with open("resident-adv.csv", "a") as f:
87 | csv_w_interv = csv.writer(f)
88 | csv_w_interv.writerow([title,
89 | artist,
90 | single,
91 | label,
92 | record,
93 | style,
94 | reviewed_date,
95 | release_date,
96 | comments,
97 | rating,
98 | description,
99 | l])
100 |
101 | time.sleep(random.randint(1, 3))
102 |
103 | time.sleep(random.randint(1, 3))
104 |
--------------------------------------------------------------------------------
/CTSNet-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 |
10 |
11 | driver = webdriver.PhantomJS()
12 | next_page = "https://www.ctsnet.org/surgeons/surgeons-advanced-search?ln=&fn=&subspecialty=adult_cardiac_surgery&city=&country=gb&province=&o"
13 |
14 | with open("IT-cardi.csv", "a") as f:
15 | csv_w_interv = csv.writer(f)
16 | csv_w_interv.writerow(["Name",
17 | "Hospital",
18 | "Phone",
19 | "Interests",
20 | "Practice-Areas",
21 | "City-Region",
22 | "Country",
23 | "Street", "URL"])
24 |
25 |
26 | for i in range(1000):
27 |
28 | driver.get(next_page)
29 |
30 | soup = BeautifulSoup(driver.page_source, "html5lib")
31 |
32 | try:
33 | next_page = "https://www.ctsnet.org" + \
34 | soup.find('a', {'title': 'Go to next page'})['href']
35 | except:
36 | next_page = ""
37 |
38 | td_a = soup.find_all(
39 | "td", {"class": "views-field views-field-field-contact-last-name"})
40 |
41 | if i == 0:
42 | links = ["https://www.ctsnet.org" +
43 | x.find("a")['href'] for x in td_a[48:]]
44 | else:
45 | links = ["https://www.ctsnet.org" + x.find("a")['href'] for x in td_a]
46 |
47 | for l in links:
48 |
49 | driver.get(l)
50 | soup = BeautifulSoup(driver.page_source, "html5lib")
51 |
52 | try:
53 | name = soup.find('h1', {"class": 'page-title'}).text.strip()
54 | print(name)
55 | except:
56 | continue
57 |
58 | try:
59 | hospital = soup.find(
60 | 'div', {
61 | "class": 'contact-institution'}).text.strip()
62 | except:
63 | continue
64 |
65 | try:
66 | country = soup.find('div',
67 | {"class": 'contact-country'}).text.strip()
68 |
69 | except:
70 | country = ''
71 |
72 | try:
73 | street = soup.find('div', {"class": 'contact-street'}).text.strip()
74 | except:
75 | street = ''
76 |
77 | try:
78 | city = soup.find(
79 | 'div', {
80 | "class": 'contact-city-province-code'}).text.strip()
81 |
82 | except:
83 | city = ''
84 |
85 | try:
86 | phone = soup.find('div', {"class": 'contact-numbers'}).text.strip()
87 | except:
88 | continue
89 |
90 | try:
91 | fields = soup.find(
92 | 'div', {
93 | "class": 'views-field views-field-field-contact-subspecialty'}).text.strip().replace(
94 | '\n', '; ')
95 | except:
96 | fields = ''
97 |
98 | try:
99 |
100 | interests = soup.find(
101 | 'div', {
102 | "class": 'field field--name-field-contact-interest field--type-text-long field--label-hidden'}).text.strip().replace(
103 | '\n', '; ')
104 | except:
105 | interests = ''
106 |
107 | if len(phone) > 0:
108 |
109 | with open("IT-cardi.csv", "a") as f:
110 | csv_w_interv = csv.writer(f)
111 | csv_w_interv.writerow(
112 | [name, hospital, phone, interests, fields, city, country, street, l])
113 |
114 | time.sleep(random.randint(1, 3))
115 | time.sleep(random.randint(1, 3))
116 |
--------------------------------------------------------------------------------
/Doximity-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from bs4 import BeautifulSoup # to parse HTML
4 | import csv # to write CSV
5 | import pandas as pd # to see CSV
6 | import time
7 | import os
8 | import random
9 |
10 |
11 | header = [
12 | 'Name',
13 | 'Title',
14 | 'Hospital',
15 | 'Phone',
16 | 'State',
17 | 'Tags',
18 | 'Summary',
19 | 'Skills',
20 | 'City',
21 | 'Address']
22 |
23 | with open("cardi.csv", "a") as f:
24 | csv_w_electro = csv.writer(f)
25 | csv_w_electro.writerow(header)
26 |
27 | driver = webdriver.PhantomJS()
28 | next_page = "https://www.doximity.com/directory/md/specialty/thoracic-surgery?from_slug=pub%2Fmichael-peter-kaye-md"
29 |
30 | for i in range(1000):
31 |
32 | driver.get(next_page)
33 |
34 | try:
35 | next_page = BeautifulSoup(
36 | driver.page_source, "html5lib").find(
37 | "a", {
38 | "class": "next_page"})['href']
39 | next_page = "https://www.doximity.com" + next_page
40 | except:
41 | next_page = ""
42 |
43 | links = [a.get_attribute(
44 | 'href') for a in driver.find_elements_by_css_selector("ul.list-4-col a")]
45 | links = random.sample(links, 15)
46 |
47 | for l in links:
48 |
49 | driver.get(l)
50 | soup = BeautifulSoup(driver.page_source, "html5lib")
51 |
52 | try:
53 | name = soup.find("span", {"id": "user_full_name"}).text.strip()
54 | print(name)
55 | except:
56 | name = ""
57 |
58 | try:
59 | title = soup.find("p", {"itemprop": "jobTitle"}).text.strip()
60 | except:
61 | title = ""
62 |
63 | try:
64 | city = soup.find(
65 | "span", {
66 | "itemprop": "addressLocality"}).text.strip()
67 | except:
68 | city = ""
69 |
70 | try:
71 | state = soup.find("span",
72 | {"itemprop": "addressRegion"}).text.strip()
73 | except:
74 | state = ""
75 |
76 | try:
77 | address = soup.find("div", {"class": "col-1-2"}).text.strip()
78 | except:
79 | address = ""
80 |
81 | try:
82 | hospital = soup.find("section",
83 | {"class": "section hospital-info"}).findAll("span",
84 | {"itemprop": "name"})
85 | hospitals = '; '.join([x.text.strip() for x in hospital])
86 | except:
87 | hospitals = ""
88 |
89 | try:
90 | phone = soup.find("span", {"itemprop": "telephone"}).text.strip()
91 | except:
92 | phone = ""
93 |
94 | try:
95 | summary = soup.find(
96 | "section", {
97 | "class": "section summary-info"}).find("ul").text.strip()
98 | except:
99 | summary = ""
100 |
101 | try:
102 | skills = soup.find(
103 | "div", {
104 | "class": "section skills-info"}).find("ul").text.strip()
105 | except:
106 | skills = ""
107 |
108 | try:
109 | tags = soup.find("div", {"class": "section"}).find(
110 | "p").text.strip()
111 |
112 | if len(phone) > 0:
113 | if "cardi" in tags.lower():
114 | with open("cardi.csv", "a") as f:
115 | csv_w_electro = csv.writer(f)
116 | csv_w_electro.writerow(
117 | [name, title, hospitals, phone, state, tags, summary, skills, city, address])
118 |
119 | except:
120 | pass
121 |
122 | time.sleep(random.randint(1, 3))
123 |
124 | time.sleep(random.randint(1, 3))
125 |
--------------------------------------------------------------------------------
/RateMyProfessors-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver # powers the browser interaction
2 | from selenium.webdriver.support.ui import Select # selects menu options
3 | from selenium.webdriver.common.keys import Keys
4 | from bs4 import BeautifulSoup # to parse HTML
5 | import csv # to write CSV
6 | import pandas as pd # to see CSV
7 | import time
8 | import os
9 | import random
10 |
11 |
12 | header = ['Prof_Name',
13 | 'Title',
14 | 'School',
15 | 'Overall_Quality',
16 | 'Overall_Take_Again',
17 | 'Overall_Difficulty',
18 | 'Overall_Hot',
19 | 'Comment_Date',
20 | 'Rating_Type',
21 | 'Course',
22 | 'Quality',
23 | 'Difficulty',
24 | 'Credit',
25 | 'Attendance',
26 | 'Textbook',
27 | 'Take_Again',
28 | 'Grade',
29 | 'Comment',
30 | 'Helpful',
31 | 'Not_Helpful',
32 | 'URL']
33 |
34 | with open("rmp.csv", "a") as f:
35 | csv_w = csv.writer(f)
36 | csv_w.writerow(header)
37 |
38 | base_url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid='
39 |
40 | driver = webdriver.PhantomJS()
41 | driver.get(base_url + str(random.randint(1, 500000)))
42 | driver.find_element_by_css_selector('a.btn.close-this').click()
43 |
44 | for i in range(500000):
45 | url = base_url + str(random.randint(1, 500000))
46 | driver.get(url)
47 |
48 | try:
49 | soup = BeautifulSoup(driver.page_source, 'html5lib')
50 | comment_table = soup.find('table', {'class': 'tftable'})
51 | comments = comment_table.find_all('tr')[1:]
52 | except:
53 | continue
54 |
55 | prof_name = ' '.join(
56 | soup.find(
57 | 'h1', {
58 | 'class': 'profname'}).text.strip().split())
59 | print(prof_name)
60 | school = soup.find('a', {'class': 'school'}).text.strip()
61 | title = ' '.join(
62 | soup.find(
63 | 'div', {
64 | 'class': 'result-title'}).text.strip().split()).split(' are you')[0]
65 |
66 | overall = soup.find_all('div', {'class': 'grade'})[:3]
67 | o_quality, o_take_again, o_difficulty = [x.text.strip() for x in overall]
68 | o_hot = soup.find_all('div', {'class': 'grade'})[3].find('img')[
69 | 'src'].split('/')[-1].split('.')[0]
70 |
71 | all_rows = []
72 | for c in comments:
73 | try:
74 | date = c.find('div', {'class': 'date'}).text.strip()
75 | rating_type = c.find('span', {'class': 'rating-type'}).text.strip()
76 | course = c.find('span', {'class': 'name'}).text.strip()
77 | credit = c.find('span', {'class': 'credit'}
78 | ).text.strip().split(':')[1].strip()
79 | attendance = c.find(
80 | 'span', {
81 | 'class': 'attendance'}).text.strip().split(':')[1].strip()
82 | textbook = c.find(
83 | 'span', {
84 | 'class': 'textbook-used'}).text.strip().split(':')[1].strip()
85 | take_again = c.find(
86 | 'span', {
87 | 'class': 'would-take-again'}).text.strip().split(':')[1].strip()
88 | grade = c.find('span', {'class': 'grade'}
89 | ).text.strip().split(':')[1].strip()
90 |
91 | brkdown = c.find(
92 | 'div', {
93 | 'class': 'breakdown'}).find_all(
94 | 'div', {
95 | 'class': 'descriptor-container'})
96 | quality, difficulty = [x.text.strip().split()[0] for x in brkdown]
97 |
98 | helpful = c.find('a', {'class': 'helpful'}).find(
99 | 'span', {'class': 'count'}).text.strip()
100 | not_helpful = c.find(
101 | 'a', {
102 | 'class': 'nothelpful'}).find(
103 | 'span', {
104 | 'class': 'count'}).text.strip()
105 |
106 | comment = c.find('p', {'class': 'commentsParagraph'}).text
107 |
108 | row = [prof_name,
109 | title,
110 | school,
111 | o_quality,
112 | o_take_again,
113 | o_difficulty,
114 | o_hot,
115 | date,
116 | rating_type,
117 | course,
118 | quality,
119 | difficulty,
120 | credit,
121 | attendance,
122 | textbook,
123 | take_again,
124 | grade,
125 | comment,
126 | helpful,
127 | not_helpful,
128 | url]
129 |
130 | all_rows.append(row)
131 |
132 | except:
133 | pass
134 |
135 | with open("rmp.csv", "a") as f:
136 | csv_w = csv.writer(f)
137 | csv_w.writerows(all_rows)
138 |
139 | time.sleep(random.randint(1, 3))
140 |
--------------------------------------------------------------------------------
/DataMartBasicSkills-req.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | from urllib import parse
4 | import json
5 | import pickle
6 | import time
7 | import re
8 | import glob
9 |
10 |
11 | class BasicSkillsCollege:
12 |
13 | def __init__(self, college):
14 |
15 | self.sess = requests.session()
16 | self.url = 'http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx'
17 | self.init_req = self.sess.get(self.url)
18 | self.init_req_soup = BeautifulSoup(self.init_req.content, 'html5lib')
19 | self.init_states = {tag['name']: tag['value']
20 | for tag in self.init_req_soup.select('input[name^=__]')}
21 | self.college = college
22 | print(self.college)
23 |
24 | def parse_params(self, r):
25 | lst = re.search(r'\[.+\]', r.text).group()
26 | terms = lst.replace(
27 | '"',
28 | '').replace(
29 | '[',
30 | '').replace(
31 | ']',
32 | '').replace(
33 | "'",
34 | "").split(',')
35 | terms = [x.strip() for x in terms]
36 |
37 | tps = []
38 | for i in range(len(terms)):
39 | if i % 2 == 0:  # callback list alternates value, label; pair as (label, value)
40 | tps.append((terms[i + 1], terms[i]))
41 |
42 | return tps
43 |
44 | def get_s_terms(self):
45 | data = self.init_states
46 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxSTerm'
47 | data['__CALLBACKPARAM'] = 'c0:LECC|0;;LBCRI|4;0:-2;'
48 | data['DXScript'] = '1_243,1_138,1_237,1_164,1_141,1_135,1_226,1_234,1_162,1_170,1_161,1_229,1_159,1_227,1_165,1_143,1_176,1_151,1_232,1_149,7_50,7_53,7_48,7_52,1_235,1_218,1_228,1_210,1_184,1_136'
49 | data['DXCss'] = '0_224,1_28,0_226,0_115,1_10,0_117,0_143,7_2,0_145,../css/styles.css,../css/navigation-mininav.css,../css/design01.css,../css/footer-without-dark-container.css'
50 | data['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0]
51 | data['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1]
52 | data['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1]
53 |
54 | req = self.sess.post(self.url, data=data)
55 |
56 | sterms = self.parse_params(req)
57 | spring_2006 = [x[0] for x in sterms].index('Spring 2006')
58 | sterms = sterms[:spring_2006 + 1][::-1]
59 |
60 | return (data, sterms)
61 |
62 | def get_skills(self):
63 | data, sterms = self.get_s_terms()
64 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxBSSub'
65 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = sterms[0][0]
66 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = sterms[0][1]
67 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = sterms[0][1]
68 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = sterms[0][0]
69 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = sterms[0][1]
70 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = sterms[0][1]
71 |
72 | req = self.sess.post(self.url, data=data)
73 | skills = self.parse_params(req)
74 |
75 | return (data, sterms, skills)
76 |
77 | def get_levels(self):
78 | data, sterms, skills = self.get_skills()
79 | college_params = []
80 | for i in range(len(sterms)):
81 | params = {}
82 | for i2 in range(len(sterms) - i):
83 | for i3 in range(len(skills)):
84 | if "ESL" not in skills[i3][0]:
85 | params['sterm'] = sterms[i]
86 | params['eterm'] = sterms[i2 + i]
87 | params['skill'] = skills[i3]
88 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxPL'
89 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = params[
90 | 'sterm'][0]
91 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = params[
92 | 'sterm'][1]
93 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = params[
94 | 'sterm'][1]
95 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = params[
96 | 'eterm'][0]
97 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = params[
98 | 'eterm'][1]
99 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = params[
100 | 'eterm'][1]
101 | data['ASPxRoundPanel1$ASPxComboBoxBSSub'] = params[
102 | 'skill'][0]
103 | data['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = params[
104 | 'skill'][1]
105 | data['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = params[
106 | 'skill'][1]
107 |
108 | req = self.sess.post(self.url, data=data)
109 |
110 | try:
111 | levels = self.parse_params(req)
112 |
113 | for l in levels:
114 | params['sterm'] = sterms[i]
115 | params['eterm'] = sterms[i2 + i]
116 | params['skill'] = skills[i3]
117 | params['level'] = l
118 | college_params.append(params)
119 | params = {}
120 |
121 | except:
122 | pass
123 |
124 | pickle.dump(
125 | college_params,
126 | open('./pickles/' + self.college[0] + '.pkl', 'wb'))
127 | return college_params
128 |
129 | def dl_csv(self):
130 | config = pickle.load(
131 | open(
132 | './pickles/' +
133 | self.college[0] +
134 | '.pkl',
135 | 'rb'))
136 | num_configs = len(config)
137 |
138 | params_json = json.load(open('pickles/dump.HAR'))
139 | params1 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[
140 | 'log']['entries'][-6]['request']['postData']['params']}
141 | params2 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[
142 | 'log']['entries'][-1]['request']['postData']['params']}
143 |
144 | headers = {d['name']: d['value'] for d in params_json[
145 | 'log']['entries'][-1]['request']['headers']}
146 | del headers['Content-Length']
147 | del headers['Cookie']
148 |
149 | cookies = {'Cookie': 'ASP.NET_SessionId' + '=' +
150 | self.init_req.cookies.get_dict()['ASP.NET_SessionId']}
151 | self.sess.headers.update(cookies)
152 |
153 | data = self.init_states
154 |
155 | for k in data.keys():
156 | params1[k] = data[k]
157 | params2[k] = data[k]
158 |
159 | for i, c in enumerate(config):
160 | print(i, num_configs, c)
161 |
162 | for p in (params1, params2):
163 | p['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0]
164 | p['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1]
165 | p['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1]
166 | p['ASPxRoundPanel1$ASPxComboBoxSTerm'] = c['sterm'][0]
167 | p['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = c['sterm'][1]
168 | p['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = c['sterm'][1]
169 | p['ASPxRoundPanel1$ASPxComboBoxETerm'] = c['eterm'][0]
170 | p['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = c['eterm'][1]
171 | p['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = c['eterm'][1]
172 | p['ASPxRoundPanel1$ASPxComboBoxBSSub'] = c['skill'][0]
173 | p['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = c['skill'][1]
174 | p['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = c['skill'][1]
175 | p['ASPxRoundPanel1$ASPxComboBoxPL'] = c['level'][0]
176 | p['ASPxRoundPanel1_ASPxComboBoxPL_VI'] = c['level'][1]
177 | p['ASPxRoundPanel1$ASPxComboBoxPL$DDD$L'] = c['level'][1]
178 |
179 | params2['__EVENTTARGET'] = 'buttonSaveAs'
180 | params2['listExportFormat'] = '1'
181 |
182 | # first POST sets up the report session on the server
183 | r = self.sess.post(self.url, data=params1)
184 |
185 | # now get full report
186 | r = self.sess.post(self.url, data=params2)
187 |
188 | with open("data/" + self.college[0] + '-' + c['sterm'][1] + '-' + c['eterm'][1] + '-' + c['skill'][1] + '-' + c['level'][1] + '.csv', 'w') as f:
189 | f.write(r.text)
190 |
191 | pickle.dump(config[i + 1:],
192 | open('./pickles/' + self.college[0] + '.pkl', 'wb'))
193 |
194 | time.sleep(1)
195 |
196 | if __name__ == "__main__":
197 | colleges = pickle.load(open('./pickles/college_list.pkl', 'rb'))
198 | colleges = colleges[:5]
199 |
200 | for c in colleges:
201 | if not './pickles/' + c[0] + '.pkl' in glob.glob('./pickles/*.pkl'):
202 | BasicSkillsCollege((c[0], c[1])).get_levels()
203 |
204 | BasicSkillsCollege((c[0], c[1])).dl_csv()
205 |
--------------------------------------------------------------------------------
/PACER-selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import random
3 | import time
4 | from bs4 import BeautifulSoup
5 | import os
6 | import csv
7 | import datetime
8 | from send_email import send_email
9 | import glob
10 | import subprocess
11 | import re
12 | from pyvirtualdisplay import Display
13 | import sys
14 |
15 |
16 | def sift_chars(fname_str):
17 | '''
18 | ensures the filename is legal by replacing illegal characters with hyphens
19 | '''
20 |
21 | illegal_chars = "%> 30:
98 | break
99 |
100 | time.sleep(1) # case has been found, proceed
101 |
102 | driver.find_element_by_name("date_from").clear()
103 | driver.find_element_by_name("date_from").send_keys("01/01/1990")
104 | driver.find_element_by_name("date_to").clear()
105 | driver.find_element_by_name("date_to").send_keys(
106 | datetime.date.today().strftime("%m/%d/%Y"))
107 |
108 | time.sleep(1)
109 | driver.find_element_by_name('button1').click()
110 |
111 | # get source to get docket info
112 | docket_source = str(driver.page_source)
113 | soup = BeautifulSoup(docket_source, 'html5lib')
114 |
115 | # set start for row, will change if scrape was interrupted
116 | row_start = 0
117 |
118 | # get associated cases if main case
119 | if case_num:
120 |
121 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f:
122 | reader = csv.reader(f)
123 | data = list(reader)
124 |
125 | if len(data) == 1:
126 | get_associated_cases(soup)
127 | # save docket source if main case
128 | with open(district + "/" + je_id + "/" + str(je_id) + ".html", "w", encoding="utf-8") as f:
129 | f.write(docket_source)
130 |
131 | else:
132 | row_start = len(data) - 1
133 |
134 | else:
135 |
136 | if os.path.exists(
137 | district +
138 | "/" +
139 | je_id +
140 | "/associated/" +
141 | str(case_num) +
142 | "/" +
143 | 'assoc_data.csv'):
144 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f:
145 | reader = csv.reader(f)
146 | data = list(reader)
147 |
148 | row_start = len(data) - 1
149 |
150 | docket_rows = []
151 | for i in range(len(soup.findAll("table")) - 5):
152 | # table is broken up into sets of 100 rows; skip the first 4 tables and the last one
153 | ind = i + 4
154 | docket_table = soup.findAll("table")[ind]
155 | docket_headers = ("Filing Date", "#", "Docket Text")
156 |
157 | # get table info in dict
158 | for row in docket_table.findAll("tr"):
159 | row_data = []
160 | for i, column in enumerate(row.findAll("td")):
161 | if i == 0:
162 | row_data.append(column.text)
163 | elif i == 2:
164 | cell_urls = {}
165 | urls = column.findAll("a")
166 | for u in urls:
167 | cell_urls[u.text.strip()] = u.get("href")
168 |
169 | row_data.append((column.text.strip(), cell_urls))
170 |
171 | elif i > 2:
172 | row_data.append(column.text.strip())
173 |
174 | if len(row_data) > 0:
175 | docket_rows.append(tuple(row_data))
176 |
177 | return docket_rows[row_start:]
178 |
179 |
180 | def process_link(
181 | link_str,
182 | base_url,
183 | district,
184 | already_scraped,
185 | adversary=False,
186 | dock_num=False):
187 | '''
188 | takes any links to documents, and downloads them into file structure
189 | '''
190 |
191 | if link_str.startswith("https://"):
192 | pass
193 | else:
194 | link_str = base_url + link_str
195 |
196 | driver.get(link_str)
197 | f_paths = []
198 |
199 | if "Multiple Documents" in str(driver.page_source):
200 | soup = BeautifulSoup(str(driver.page_source), 'html5lib')
201 | doc_table = soup.findAll("tr")
202 | for r in doc_table:
203 | if "href" in str(r):
204 | tds = r.findAll("td")
205 | doc_url = tds[0].a["href"]
206 | dl_id = doc_url.split("/")[-1]
207 | if dl_id not in already_scraped:
208 | os.system('rm *.pdf')
209 | if doc_url.startswith("https://"):
210 | driver.get(doc_url)
211 | driver.find_element_by_xpath(
212 | '//*[@id="cmecfMainContent"]/form/input').click()
213 | else:
214 | doc_url = base_url + doc_url
215 | driver.get(doc_url)
216 | driver.find_element_by_xpath(
217 | '//*[@id="cmecfMainContent"]/form/input').click()
218 |
219 | file_name = tds[2].text
220 | new_name = sift_chars(file_name.strip()) + ".pdf"
221 |
222 | # if not associated case
223 | # create file structure
224 | if not adversary:
225 | if not os.path.exists(
226 | district + "/" + je_id + "/" + docket_number):
227 | os.makedirs(
228 | district + "/" + je_id + "/" + docket_number)
229 |
230 | new_path = district + "/" + je_id + "/" + docket_number + "/" + new_name
231 |
232 | else:
233 | if not os.path.exists(
234 | district +
235 | "/" +
236 | je_id +
237 | "/associated/" +
238 | adversary +
239 | "/" +
240 | dock_num):
241 | os.makedirs(
242 | district +
243 | "/" +
244 | je_id +
245 | "/associated/" +
246 | adversary +
247 | "/" +
248 | dock_num)
249 |
250 | new_path = district + "/" + je_id + "/associated/" + \
251 | adversary + "/" + dock_num + "/" + new_name
252 |
253 | # wait for file to download
254 | counter = 0
255 | while len(glob.glob("*.pdf")) == 0:
256 | time.sleep(1)
257 | counter += 1
258 | if counter > 500:
259 | break
260 |
261 | time.sleep(4)
262 | download_name = glob.glob("*.pdf")[0]
263 | os.rename(
264 | download_name, re.sub(
265 | r'[^\x00-\x7f]', '-', new_path))
266 |
267 | already_scraped.append(dl_id)
268 | f_paths.append(new_path)
269 |
270 | time.sleep(1)
271 | os.system('rm *.pdf')
272 | time.sleep(1)
273 |
274 | else:
275 | soup = BeautifulSoup(str(driver.page_source), 'html5lib')
276 |
277 | restricted = False
278 |
279 | try:
280 | dl_id = soup.find("form")["action"].split("/")[-1]
281 |
282 | except:
283 | if "The document is restricted" in driver.page_source:
284 | restricted = True
285 | elif "document is not available" in driver.page_source:
286 | restricted = True
287 |
288 | if not restricted:
289 | os.system('rm *.pdf')
290 | driver.find_element_by_xpath(
291 | '//*[@id="cmecfMainContent"]/form/input').click()
292 |
293 | if dl_id not in already_scraped:
294 |
295 | # create file structure
296 | if not adversary:
297 | if not os.path.exists(
298 | district + "/" + je_id + "/" + docket_number):
299 | os.makedirs(
300 | district + "/" + je_id + "/" + docket_number)
301 |
302 | new_path = district + "/" + je_id + "/" + docket_number + "/Main Document.pdf"
303 |
304 | else:
305 | if not os.path.exists(
306 | district +
307 | "/" +
308 | je_id +
309 | "/associated/" +
310 | adversary +
311 | "/" +
312 | dock_num):
313 | os.makedirs(
314 | district +
315 | "/" +
316 | je_id +
317 | "/associated/" +
318 | adversary +
319 | "/" +
320 | dock_num)
321 |
322 | new_path = district + "/" + je_id + "/associated/" + \
323 | adversary + "/" + dock_num + "/Main Document.pdf"
324 |
325 | # wait for file to download
326 | counter = 0
327 | while len(glob.glob("*.pdf")) == 0:
328 | time.sleep(1)
329 | counter += 1
330 | if counter > 500:
331 | break
332 |
333 | time.sleep(4)
334 | download_name = glob.glob("*.pdf")[0]
335 | os.rename(
336 | download_name, re.sub(
337 | r'[^\x00-\x7f]', '-', new_path))
338 |
339 | already_scraped.append(dl_id)
340 | f_paths.append(new_path)
341 |
342 | time.sleep(1)
343 | os.system('rm *.pdf')
344 | time.sleep(1)
345 |
346 | else:
347 | f_paths.append("RESTRICTED")
348 | time.sleep(5)
349 |
350 | return (f_paths, already_scraped)
351 |
352 |
353 | def get_associated_cases(soup):
354 |
355 | ass_exist = True
356 |
357 | try:
358 | ass_cases_ext = soup.findAll("div", {"class": "noprint"})[
359 | 1].find("a")["href"]
360 |
361 | except:
362 | ass_exist = False
363 |
364 | if ass_exist:
365 | driver.get(base_url + ass_cases_ext)
366 | driver.find_element_by_xpath('//*[@id="referrer_form"]/p/a').click()
367 | soup = BeautifulSoup(str(driver.page_source), "html5lib")
368 |
369 | assoc_rows = soup.find("table").findAll("tr")
370 |
371 | if not os.path.exists(district + "/" + je_id + "/" + "associated"):
372 | os.makedirs(district + "/" + je_id + "/" + "associated")
373 |
374 | with open(district + "/" + je_id + "/" + str(je_id) + "_associated_cases.html", "w", encoding="utf-8") as f:
375 | f.write(str(driver.page_source))
376 |
377 | # if a previous run was interrupted, resume from the last row written
378 | if os.path.exists(
379 | str(district) +
380 | "/" +
381 | str(je_id) +
382 | "/" +
383 | str(je_id) +
384 | '_associated_cases.csv'):
385 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'r', encoding="utf-8") as f:
386 | reader = csv.reader(f)
387 | data = list(reader)
388 |
389 | if len(data) - 1 == len(assoc_rows):
390 | assoc_rows = [assoc_rows[-1]]
391 | else:
392 | assoc_rows = assoc_rows[len(data) - 2:]
393 |
394 | else:
395 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f:
396 | w = csv.writer(f, delimiter=',')
397 | header = (
398 | "je_id",
399 | "Related Case No",
400 | "Caption",
401 | "Type",
402 | "Judge",
403 | "Plaintiff",
404 | "Defendant",
405 | "Plaintiff Lawyer",
406 | "Defendant Lawyer",
407 | "Date Filed",
408 | "Date Terminated",
409 | "Nature of Suit")
410 | w.writerow(header)
411 |
412 | for row in assoc_rows: # CHANGE FOR FULL
413 | columns = row.findAll("td")
414 | if len(columns) > 0:
415 |
416 | case_ext = columns[1].find("a")["href"]
417 | case_num = columns[1].find("a").text
418 | caption = ' '.join(columns[1].text.split()[1:])
419 | case_type = columns[2].text
420 |
421 | row_to_write = (je_id, case_num, caption, case_type)
422 |
423 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f:
424 | w = csv.writer(f, delimiter=',')
425 | w.writerow(row_to_write)
426 |
427 | driver.get(base_url + case_ext)
428 |
429 | docket_rows = get_docket_rows(
430 | driver=driver,
431 | case_num=False,
432 | year=False,
433 | court_perl=False)
434 |
435 | if not os.path.exists(
436 | district + "/" + je_id + "/associated/" + case_num):
437 | os.makedirs(
438 | district + "/" + je_id + "/associated/" + case_num)
439 |
440 | with open(district + "/" + je_id + "/associated/" + case_num + "/" + str(case_num) + ".html", "w", encoding="utf-8") as f:
441 | f.write(str(driver.page_source))
442 |
443 | if os.path.exists(
444 | district +
445 | "/" +
446 | je_id +
447 | "/associated/" +
448 | str(case_num) +
449 | "/" +
450 | 'assoc_data.csv'):
451 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f:
452 | reader = csv.reader(f)
453 | data = list(reader)
454 |
455 | docket_rows = docket_rows[len(data) - 1:]
456 |
457 | else:
458 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f:
459 | w = csv.writer(f, delimiter=',')
460 | header = (
461 | "je_id",
462 | "case_num",
463 | "docket_text",
464 | "docket_number",
465 | "docket_date",
466 | "file_link",
467 | "[lawfirm1]",
468 | "[lawyers1]",
469 | "[lawfirm2]",
470 | "[lawyers2]",
471 | "[lawfirm3]",
472 | "[lawyers3]",
473 | "[moving party]",
474 | "[motion caption]")
475 | w.writerow(header)
476 |
477 | for row in docket_rows: # just 20 rows CHANGE FOR FULL
478 | docket_date = row[0]
479 | docket_text = row[2].strip()
480 | if len(
481 | row[1]) > 1 and len(
482 | row[1][0]) > 0 and row[1][0][0].isdigit():
483 | docket_number = row[1][0].split()[0]
484 |
485 | else:
486 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f:
487 | reader = csv.reader(f)
488 | temp_data = list(reader)
489 | docket_number = temp_data[-1][-3]
490 |
491 | already_scraped = []
492 | paths = []
493 | for c in row:
494 | if len(c) > 1 and isinstance(
495 | c[1], dict) and len(
496 | c[1]) > 0:
497 | for k in c[1].keys():
498 | url = c[1][k]
499 | res = process_link(
500 | link_str=url,
501 | base_url=base_url,
502 | district=district,
503 | already_scraped=already_scraped,
504 | dock_num=docket_number,
505 | adversary=case_num)
506 | file_paths = res[0]
507 | if len(file_paths) > 0:
508 | already_scraped = res[1]
509 | paths.extend(file_paths)
510 |
511 | # wait after each link call
512 | time.sleep(random.randint(1, 3))
513 |
514 | csv_row = [
515 | je_id,
516 | case_num,
517 | docket_text,
518 | docket_number,
519 | docket_date,
520 | "; ".join(paths)]
521 | scraped_data[district].append(csv_row)
522 |
523 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f:
524 | w = csv.writer(f, delimiter=',')
525 | w.writerow(csv_row)
526 |
527 | time.sleep(random.randint(1, 3))
528 |
529 |
530 |
531 |
532 | # main program
533 | # for case num info
534 | with open('dataset.csv', 'r', encoding="utf-8") as f:
535 | reader = csv.reader(f)
536 | data = list(reader)
537 |
538 | with open('distlogin.csv', 'r', encoding="utf-8") as f:
539 | reader = csv.reader(f)
540 | distlogin_csv = list(reader)
541 |
542 | with open('completed', 'r') as f:
543 | completed_cases = f.read().split('\n')
544 |
545 | email_address = distlogin_csv[0][0]
546 | email_password = distlogin_csv[0][1]
547 | dl_directory = distlogin_csv[0][2]
548 | district = distlogin_csv[1][0]
549 |
550 | # change for each district
551 | dist_data = [x for x in data if x[-2] == district]
552 | district = ''.join(district.split())
553 |
554 | distlogin = {}
555 |
556 | for r in distlogin_csv[1:]:
557 | distlogin[district] = {"login": r[1],
558 | "pw": r[2],
559 | "base_url": r[3]}
560 |
561 | # prepare and loop
562 | scraped_data = {}
563 | scraped_data[district] = []
564 |
565 | if not os.path.exists(district):
566 | os.makedirs(district)
567 |
568 | driver = login_to_pacer(
569 | login_user=distlogin[district]["login"],
570 | login_password=distlogin[district]["pw"],
571 | dl_directory=dl_directory)
572 |
573 | for case in dist_data: # just two cases CHANGE FOR FULL
574 |
575 | print(datetime.datetime.time(datetime.datetime.now()))
576 |
577 | company = case[0]
578 | je_id = case[1]
579 | case_num = case[2]
580 | petition_date = case[3]
581 | year = case[6]
582 |
583 | if je_id not in completed_cases:
584 |
585 | send_email(
586 | email_address,
587 | email_password,
588 | email_address,
589 | "New Case",
590 | "JEID" +
591 | str(je_id))
592 |
593 | if not os.path.exists(district + "/" + je_id):
594 | os.makedirs(district + "/" + je_id)
595 |
596 | if not os.path.exists(
597 | district +
598 | "/" +
599 | je_id +
600 | "/" +
601 | je_id +
602 | "_data.csv"):
603 | # for output data
604 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'w', encoding="utf-8") as f:
605 | w = csv.writer(f, delimiter=',')
606 | header = (
607 | "Company",
608 | "je_id",
609 | "petition_date",
610 | "casenum",
611 | "xdistfiled",
612 | "docket_text",
613 | "docket_number",
614 | "docket_date",
615 | "file_link",
616 | "[lawfirm1]",
617 | "[lawyers1]",
618 | "[lawfirm2]",
619 | "[lawyers2]",
620 | "[lawfirm3]",
621 | "[lawyers3]",
622 | "[moving party]",
623 | "[motion caption]")
624 | w.writerow(header)
625 |
626 | # change for each district
627 | base_url = distlogin[district]["base_url"]
628 | court_perl = base_url + "/cgi-bin/DktRpt.pl"
629 | docket_rows = get_docket_rows(
630 | driver=driver,
631 | case_num=case_num,
632 | year=year,
633 | court_perl=court_perl)
634 |
635 | for row in docket_rows: # just 20 rows CHANGE FOR FULL
636 | docket_date = row[0]
637 | docket_text = row[2].strip()
638 | if len(
639 | row[1]) > 1 and len(
640 | row[1][0]) > 0 and row[1][0][0].isdigit():
641 | docket_number = row[1][0].split()[0]
642 | else:
643 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f:
644 | reader = csv.reader(f)
645 | temp_data = list(reader)
646 | docket_number = temp_data[-1][-3]
647 |
648 | already_scraped = []
649 | paths = []
650 | for c in row:
651 | if len(c) > 1 and isinstance(c[1], dict) and len(c[1]) > 0:
652 | for k in c[1].keys():
653 | url = c[1][k]
654 | res = process_link(
655 | link_str=url,
656 | base_url=base_url,
657 | district=district,
658 | already_scraped=already_scraped)
659 | file_paths = res[0]
660 | if len(file_paths) > 0:
661 | already_scraped = res[1]
662 | paths.extend(file_paths)
663 |
664 | # wait after each link call
665 | time.sleep(random.randint(1, 3))
666 |
667 | csv_row = [
668 | company,
669 | je_id,
670 | petition_date,
671 | case_num,
672 | district,
673 | docket_text,
674 | docket_number,
675 | docket_date,
676 | "; ".join(paths)]
677 | scraped_data[district].append(csv_row)
678 |
679 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'a', encoding="utf-8") as f:
680 | w = csv.writer(f, delimiter=',')
681 | w.writerow(csv_row)
682 |
683 | with open('completed', 'a') as f:
684 | f.write('\n' + je_id)
685 |
686 | # zip and push to box
687 | p = subprocess.Popen(['bash',
688 | 'zip-push.sh',
689 | je_id],
690 | stdin=None,
691 | stdout=None,
692 | stderr=None,
693 | close_fds=True)
694 |
695 | send_email(
696 | email_address,
697 | email_password,
698 | email_address,
699 | "Finished",
700 | "Done scraping." +
701 | str(je_id))
702 | print(datetime.datetime.time(datetime.datetime.now()))
703 |
--------------------------------------------------------------------------------