├── NCDC-NOAA-wget.sh ├── GLERLNOAA-wget.sh ├── EnergyAutomatedRegister-wget.sh ├── GoogleDirectionsBuildURL-API.py ├── Kalamazoo-BS.py ├── GoogleSearch-dry.py ├── CrunchBase-API.py ├── Grimm-BS.py ├── FRMinutesDiscoutRate-BS.py ├── GLORecords-dry.py ├── GoogleGeocodeSearches.py ├── OpenSecrets-API.py ├── Perrault-BS.py ├── SFPlanning-BS.py ├── WebofScience-API.py ├── LICENSE ├── DataSFFireIncidents-API.r ├── GlassDoor-API.py ├── NationalStolenArtFile-BS.py ├── WikipediaRevisionHistory-API.py ├── INOCAR-selenium.py ├── LucidChart-BS.py ├── MRIPNOAA-selenium.py ├── .gitignore ├── Kiva-API.py ├── BGG-BS.py ├── BoardGameCapital-selenium.py ├── IMSDB-BS.py ├── NHSTrustsInfo-BS.py ├── INOCAR-AJAX.py ├── GoogleGeoLatLong-API.py ├── LARRP-BS.py ├── Wiktionary-API.py ├── ADA-ERP-BS.py ├── RioGrandeGames-selenium.py ├── STNMFSNOAA-BS.py ├── BAAD-BS.py ├── README.md ├── BSBDigitaleSammlungen-API.py ├── AHA-selenium.py ├── ResidentAdvisor-selenium.py ├── CTSNet-selenium.py ├── Doximity-selenium.py ├── RateMyProfessors-selenium.py ├── DataMartBasicSkills-req.py └── PACER-selenium.py /NCDC-NOAA-wget.sh: -------------------------------------------------------------------------------- 1 | wget -r ftp://data.ncdc.noaa.gov/cdr/solar-irradiance/tsi/ -------------------------------------------------------------------------------- /GLERLNOAA-wget.sh: -------------------------------------------------------------------------------- 1 | wget -r --no-parent https://www.glerl.noaa.gov//metdata/status/status_archive/ -------------------------------------------------------------------------------- /EnergyAutomatedRegister-wget.sh: -------------------------------------------------------------------------------- 1 | wget https://www.energy.gov/eere/downloads/automated-register-implemented-actions 2 | mv automated-register-implemented-actions automated-register-implemented-actions.html 3 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2.xlsx 4 | wget https://www.energy.gov/sites/prod/files/2016/07/f33/Automated%20Register%20V1.0.2%20User%20Manual.pdf -------------------------------------------------------------------------------- /GoogleDirectionsBuildURL-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | base = "https://www.google.com/maps/dir/" 4 | 5 | locs = [ 6 | "305 Harrison St, seattle, WA 98109, USA", 7 | "san francisco airport", 8 | "UC berkeley", 9 | "stanford university"] 10 | 11 | # API STUFF HERE 12 | 13 | ordered_locs = [] 14 | 15 | final_url = base 16 | 17 | for l in locs: 18 | final_url += '+'.join(l.split()) + "/" 19 | 20 | print(final_url) 21 | -------------------------------------------------------------------------------- /Kalamazoo-BS.py: -------------------------------------------------------------------------------- 1 | from urllib.request import Request, urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | urls = [] 5 | for i in range(1035, 1053): 6 | urls.append( 7 | "http://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=" + 8 | str(i) + 9 | "&context=medieval_cong_archive") 10 | 11 | for i, url in enumerate(urls): 12 | res = urlopen(Request(url)) 13 | pdf = open(("kzoo/kalamazoo_" + str(i) + ".pdf"), 'wb') 14 | pdf.write(res.read()) 15 | pdf.close() 16 | -------------------------------------------------------------------------------- /GoogleSearch-dry.py: -------------------------------------------------------------------------------- 1 | import dryscrape 2 | import sys 3 | 4 | 5 | 
search_term = 'testing' 6 | 7 | # set up a web scraping session 8 | sess = dryscrape.Session(base_url='http://google.com') 9 | 10 | # we don't need images 11 | sess.set_attribute('auto_load_images', False) 12 | 13 | # visit homepage and search for a term 14 | sess.visit('/') 15 | q = sess.at_xpath('//*[@name="q"]') 16 | q.set(search_term) 17 | q.form().submit() 18 | 19 | # extract all links 20 | for link in sess.xpath('//a[@href]'): 21 | print(link) 22 | print(link['href']) 23 | 24 | # # save a screenshot of the web page 25 | # sess.render('google.png') 26 | # print("Screenshot written to 'google.png'") 27 | -------------------------------------------------------------------------------- /CrunchBase-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from __future__ import division 4 | import math 5 | import csv 6 | 7 | # set key 8 | key = "PUT_KEY_HERE" 9 | 10 | # set base url 11 | base_url = "https://api.crunchbase.com/v/3/organizations" 12 | 13 | # set response format 14 | response_format = ".json" 15 | 16 | # set search parameters 17 | search_params = {"name": "uber", 18 | "user_key": key, 19 | "page": "1"} 20 | 21 | # make request 22 | r = requests.get(base_url + response_format, params=search_params) 23 | response_text = r.text 24 | 25 | # Convert JSON response to a dictionary 26 | data = json.loads(response_text) 27 | 28 | print(data.keys()) 29 | -------------------------------------------------------------------------------- /Grimm-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pandas as pd 5 | 6 | soup = BeautifulSoup(requests.get( 7 | "https://www.cs.cmu.edu/~spok/grimmtmp/").text, 'html5lib') 8 | 9 | titles = [x.text.strip() for x in soup.find_all("li")] 10 | 11 | base = 'https://www.cs.cmu.edu/~spok/grimmtmp/' 12 | rows = [] 13 | 14 | for i in range(1, 210): 15 | 16 | url = 'https://www.cs.cmu.edu/~spok/grimmtmp/{}.txt'.format( 17 | str(i).zfill(3)) 18 | 19 | text = requests.get(url).text.strip() 20 | 21 | rows.append([titles[i - 1], text]) 22 | 23 | time.sleep(1) 24 | 25 | df = pd.DataFrame(rows, columns=['Title', 'Text']) 26 | df.to_csv("grimm.csv", index=False) 27 | -------------------------------------------------------------------------------- /FRMinutesDiscoutRate-BS.py: -------------------------------------------------------------------------------- 1 | from urllib.request import Request, urlopen 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | html = urlopen("http://www.federalreserve.gov/monetarypolicy/discountrate.htm") 6 | bsObj = BeautifulSoup(html.read(), "lxml") 7 | d1 = bsObj.findAll("option") 8 | 9 | urls = [] 10 | for item in d1: 11 | if "PDF" in str(item.get_text()): 12 | prefix = "http://www.federalreserve.gov" 13 | url = prefix + str(item['value']) 14 | urls.append((url, str(item.get_text()))) 15 | 16 | urls = urls[:3] 17 | 18 | print(len(urls)) 19 | 20 | for url in urls: 21 | res = urlopen(Request(url[0])) 22 | pdf = open((url[1] + ".pdf"), 'wb') 23 | pdf.write(res.read()) 24 | pdf.close() 25 | -------------------------------------------------------------------------------- /GLORecords-dry.py: -------------------------------------------------------------------------------- 1 | import dryscrape 2 | import sys 3 | from urllib.request import Request, urlopen 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | 8 | urls = ["http://www.glorecords.blm.gov"] 9 | ext 
= "/ConvertedImages/CV_Patent_0123-207.PDF" 10 | 11 | for url in urls: 12 | # set up a web scraping session 13 | sess = dryscrape.Session(base_url=url) 14 | 15 | # we don't need images 16 | sess.set_attribute('auto_load_images', True) 17 | 18 | # visit homepage and search for a term 19 | sess.visit(ext) 20 | time.sleep(15) 21 | # sess.render('sshot.png') 22 | 23 | res = urlopen(Request(url + ext)) 24 | pdf = open((url[1] + ".pdf"), 'wb') 25 | pdf.write(res.read()) 26 | pdf.close() 27 | -------------------------------------------------------------------------------- /GoogleGeocodeSearches.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib 3 | import time 4 | 5 | searches = ['UC Berkeley', 'University of Minnesota', 'Middlebury College'] 6 | 7 | latitude = [] 8 | longitude = [] 9 | for s in searches: 10 | search = urllib.parse.quote(s) 11 | 12 | print(s) 13 | 14 | try: 15 | json_res = requests.get( 16 | 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(search)).json() 17 | coordinates = json_res['results'][0]['geometry']['location'] 18 | latitude.append(coordinates['lat']) 19 | longitude.append(coordinates['lng']) 20 | except: 21 | latitude.append(None) 22 | longitude.append(None) 23 | 24 | time.sleep(.5) 25 | 26 | print(list(zip(latitude, longitude))) 27 | -------------------------------------------------------------------------------- /OpenSecrets-API.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.request import Request, urlopen 3 | 4 | 5 | def getJson(func, apikey, params): 6 | url = 'http://www.opensecrets.org/api/?method=%s&output=json&%s&apikey=%s' % \ 7 | (func, params, apikey) 8 | 9 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 10 | 11 | response = urlopen(req).read().decode('utf-8') 12 | responseJson = json.loads(response) 13 | 14 | return responseJson 15 | 16 | func = "getOrgs" 17 | apikey = "" 18 | params = "org=Exxon" 19 | 20 | info = getJson(func, apikey, params) 21 | 22 | print(info) 23 | 24 | orgid = info.get("response").get("organization")[ 25 | 0].get("@attributes").get("orgid") 26 | 27 | func = "orgSummary" 28 | params = "id=" + orgid 29 | summary = getJson(func, apikey, params) 30 | 31 | print(summary) 32 | -------------------------------------------------------------------------------- /Perrault-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pandas as pd 5 | 6 | rows = [] 7 | for i in range(1, 12): 8 | 9 | url = 'http://www.pitt.edu/~dash/perrault{}.html'.format( 10 | str(i).zfill(2)) 11 | 12 | soup = BeautifulSoup(requests.get(url).text, 'html5lib') 13 | 14 | title = soup.find('h1').text.strip() 15 | text = '\n'.join([p.text for p in soup.find_all('p')[:-1]]) 16 | try: 17 | text += soup.find('blockquote').text 18 | except: 19 | pass 20 | 21 | bullets = soup.find_all('li') 22 | for b in bullets: 23 | if "aarne" in b.text.lower(): 24 | at = ''.join([ch for ch in b.text if ch.isnumeric()]) 25 | 26 | rows.append([title, at, text]) 27 | 28 | time.sleep(1) 29 | 30 | df = pd.DataFrame(rows, columns=['Title', 'Aarne-Thompson', 'Text']) 31 | df.to_csv("perrault.csv", index=False) 32 | -------------------------------------------------------------------------------- /SFPlanning-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 
from bs4 import BeautifulSoup 3 | import time 4 | 5 | all_links = [] 6 | res = requests.get('http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html') 7 | soup = BeautifulSoup(res.text, 'lxml') 8 | 9 | # build date links 10 | base = 'http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/' 11 | links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')] 12 | 13 | # collect nested links 14 | for l in links: 15 | res = requests.get(l) 16 | soup = BeautifulSoup(res.text, 'lxml') 17 | 18 | links = [base + a['href'] for a in soup.find('div', {'id': 'ctl00_content_Screen'})('a')] 19 | all_links.extend(links) 20 | time.sleep(1) 21 | 22 | # save HTML response for all links 23 | for l in all_links: 24 | html = requests.get(l).text 25 | name = l.split('=')[-1] 26 | print(name) 27 | with open('sfplanning/' + name, 'w') as f: 28 | f.write(html) 29 | time.sleep(1) 30 | -------------------------------------------------------------------------------- /WebofScience-API.py: -------------------------------------------------------------------------------- 1 | from wos import WosClient 2 | import wos.utils 3 | import time 4 | 5 | # must be on campus with access 6 | with WosClient('') as client: 7 | journals = ["Science"] 8 | years = range(2000, 2001) 9 | for journal in journals: 10 | for year in years: 11 | 12 | rf = wos.utils.recordsFound( 13 | client, 'PY=' + str(year) + ' AND SO=' + journal) 14 | 15 | for num in range(1, rf, 100): 16 | 17 | info = wos.utils.query( 18 | client, 19 | 'PY=' + 20 | str(year) + 21 | ' AND SO=' + 22 | journal, 23 | count=100, 24 | frecord=num) 25 | 26 | with open("data/" + str(year) + '-' + journal + ' ' + str(num) + ".xml", "w") as f: 27 | f.write(str(info.encode('utf-8'))) 28 | 29 | time.sleep(2) 30 | 31 | # http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Christopher Hench 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /DataSFFireIncidents-API.r: -------------------------------------------------------------------------------- 1 | ################################################## 2 | ## Project: Collect API data on fire incidents 3 | ## Author: Christopher Hench 4 | ################################################## 5 | 6 | flatten_json <- function(df) { 7 | 8 | for (col in names(df)) { 9 | if (is.list(df[[col]])) { 10 | i <- 1 11 | for (row in df[[col]]) { 12 | 13 | df[[col]][i] <- paste(row, collapse = '; ') 14 | i <- i + 1 15 | } 16 | df[[col]] <- unlist(df[[col]]) 17 | } 18 | } 19 | return (df) 20 | } 21 | 22 | base_url <- 'https://data.sfgov.org/resource/wbb6-uh78.json?' 23 | 24 | incident_date <- '2017-10-22T00:00:00.000' 25 | incident_date <- URLencode(URL = incident_date, reserved = TRUE) 26 | 27 | get_request <- paste0(base_url, "incident_date=", incident_date) 28 | print(get_request) 29 | 30 | response <- httr::GET(url = get_request) 31 | response <- httr::content(x = response, as = "text") 32 | response_df <- data.frame(jsonlite::fromJSON(txt = response, simplifyDataFrame = TRUE, flatten = TRUE)) 33 | 34 | flattened <- flatten_json(response_df) 35 | 36 | write.csv(flattened, file='fire-incidents.csv') -------------------------------------------------------------------------------- /GlassDoor-API.py: -------------------------------------------------------------------------------- 1 | # https://pypi.python.org/pypi/glassdoor 2 | # http://stackoverflow.com/questions/30956891/rest-glassdoor-api-requires-user-agent-in-header 3 | import urllib.request as request 4 | import requests 5 | import json 6 | from collections import OrderedDict 7 | 8 | # authentication information & other request parameters 9 | params_gd = OrderedDict({ 10 | "v": "1", 11 | "format": "json", 12 | "t.p": "", 13 | "t.k": "", 14 | "action": "employers", 15 | "employerID": "11111", 16 | # programmatically get the IP of the machine 17 | "userip": json.loads(request.urlopen("http://ip.jsontest.com/").read().decode('utf-8'))['ip'], 18 | "useragent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36" 19 | }) 20 | 21 | # construct the URL from parameters 22 | basepath_gd = 'http://api.glassdoor.com/api/api.htm' 23 | 24 | # request the API 25 | response_gd = requests.get( 26 | basepath_gd, params=params_gd, headers={ 27 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36"}) 28 | 29 | # check the response code (should be 200) & the content 30 | response_gd 31 | data = json.loads(response_gd.text) 32 | 33 | print(data["response"]["employers"][0].keys()) 34 | -------------------------------------------------------------------------------- /NationalStolenArtFile-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import pickle 5 | import pandas as pd 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} 9 | artworks = [] 10 | for i in range(0, 7200, 100): 11 | print(i) 12 | url = 'https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file?b_start:int=' + \ 13 | str(i) 14 | res = requests.get(url, headers) 15 | soup = BeautifulSoup(res.text, 'html5lib') 16 | 17 | for i in soup.find_all('li', 
{'class': 'grid-item'}): 18 | 19 | art = {} 20 | art['title'] = i.find('h3').text 21 | art['description'] = i.find('p').text 22 | 23 | try: 24 | art['image_link'] = i.find('img')['src'] 25 | except: 26 | art['image_link'] = 'None' 27 | 28 | keys = [x.text for x in i.find_all('b')] 29 | values = [x.text for x in i.find_all('span')] 30 | 31 | for t in list(zip(keys, values)): 32 | art[t[0]] = t[1] 33 | 34 | artworks.append(art) 35 | 36 | pickle.dump(artworks, open('artworks.pkl', 'wb')) 37 | time.sleep(5) 38 | 39 | pd.DataFrame(artworks).to_csv('artworks.csv', index=False) 40 | -------------------------------------------------------------------------------- /WikipediaRevisionHistory-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import time 4 | import re 5 | from bs4 import BeautifulSoup 6 | import json 7 | import pickle 8 | 9 | 10 | def get_revisions(page_title, num_rev): 11 | url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&rvprop=ids|flags|timestamp|comment|user|content|tags|flags&rvlimit=1&rvdiffto=prev&titles=" + page_title 12 | revisions = [] 13 | next_request = '' # information for the next request 14 | 15 | # while True: 16 | for i in range(num_rev): 17 | response = json.loads( 18 | requests.get( 19 | url + 20 | next_request).text) # web request 21 | 22 | page_id = list(response['query']['pages'].keys())[0] 23 | revisions.append( 24 | response['query']['pages'][ 25 | str(page_id)]['revisions'][0]) 26 | 27 | cont = response['continue']['rvcontinue'] 28 | if not cont: # break the loop if 'continue' element missing 29 | break 30 | 31 | # gets the revision Id from which to start the next request 32 | next_request = "&rvcontinue=" + cont 33 | 34 | time.sleep(1) 35 | 36 | return revisions 37 | 38 | 39 | page_names = pickle.load(open('page_names.pkl', 'rb')) 40 | 41 | for p in page_names: 42 | print(p) 43 | results = get_revisions(p, 200) 44 | pickle.dump(results, open('pickles/' + p + '.pkl', 'wb')) 45 | -------------------------------------------------------------------------------- /INOCAR-selenium.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.support.ui import Select 8 | from bs4 import BeautifulSoup 9 | 10 | 11 | def init_driver(): 12 | driver = webdriver.Chrome() 13 | driver.wait = WebDriverWait(driver, 5) 14 | return driver 15 | 16 | 17 | def lookup(driver, query): 18 | driver.get("http://www.inocar.mil.ec/mareas/pagina_mareas.php") 19 | # a = driver.wait.until(EC.presence_of_element_located((By.NAME, 20 | # "id_puerto"))) 21 | driver.find_element_by_xpath( 22 | "//select[@name='id_puerto']/option[@value='378']").click() 23 | driver.find_element_by_xpath( 24 | "//select[@name='dias']/option[@value='1']").click() 25 | driver.find_element_by_xpath( 26 | "//select[@name='mes']/option[@value='1']").click() 27 | driver.find_element_by_xpath( 28 | "//select[@name='anio']/option[@value='2015']").click() 29 | driver.find_element_by_name("Submit").click() 30 | 31 | html = driver.page_source 32 | soup = BeautifulSoup(html, 'lxml') 33 | a = soup.findAll("div") 34 | print(a) 35 | 36 | if __name__ == "__main__": 37 | driver = 
init_driver() 38 | lookup(driver, "Selenium") 39 | time.sleep(5) 40 | driver.quit() 41 | -------------------------------------------------------------------------------- /LucidChart-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import json 3 | import csv 4 | import sys 5 | 6 | # read in html source of chart 7 | html_path = sys.argv[1] 8 | with open(html_path, "r") as f: 9 | html = f.read() 10 | 11 | soup = BeautifulSoup(html, "lxml") 12 | 13 | # find line of JSON data 14 | raw_data = str(soup)[ 15 | str(soup).find("var doc = ") + 16 | len("var doc = "):str(soup).find(";\n doc.Document.state = doc.Document.state")] 17 | 18 | figure_data = json.loads(raw_data) 19 | 20 | # get states JSON 21 | states = json.loads(figure_data["Document"]['state']) 22 | 23 | 24 | def find_corr_text(thread_id, soup): 25 | ''' 26 | find the text from a ThreadId 27 | ''' 28 | item_id = states['Threads'][thread_id]["ItemId"] 29 | loc = str(soup).find(item_id) 30 | end = str(soup)[loc:].find("}}") 31 | raw = str(soup)[loc + len(item_id) + 3:][:end - \ 32 | len(item_id) - 1].replace("\\", "") 33 | 34 | try: 35 | props = json.loads(raw) 36 | text = props["Properties"]["Text"]['t'] 37 | except: 38 | return None 39 | 40 | return text 41 | 42 | # cycle through comments and add text 43 | rows = [] 44 | for k in states['Comments'].keys(): 45 | states['Comments'][k]['text'] = find_corr_text( 46 | states['Comments'][k]['ThreadId'], soup) 47 | rows.append(states['Comments'][k]) 48 | 49 | # write csv 50 | with open('lucidchart-comments.csv', 'w') as f: 51 | w = csv.DictWriter(f, list(set(list(rows[0].keys()) + ['Type']))) 52 | w.writeheader() 53 | w.writerows(rows) 54 | -------------------------------------------------------------------------------- /MRIPNOAA-selenium.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | from urllib.request import Request, urlopen 10 | import time 11 | 12 | 13 | driver = webdriver.Chrome() # needs chromedriver in PATH 14 | 15 | # iframed into 16 | # http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project 17 | driver.get("https://www.st.nmfs.noaa.gov/pims/#view=public_page&program_id=1") 18 | 19 | time.sleep(15) 20 | 21 | for i in range(11): 22 | 23 | projects = [] 24 | 25 | for i in driver.find_elements_by_class_name("dijitTitlePaneTextNode"): 26 | os.mkdir("MRIP/" + i.text) 27 | projects.append(i.text) 28 | 29 | content_pane = driver.find_elements_by_class_name("dijitContentPane")[0] 30 | links = content_pane.find_elements_by_class_name("docLink") 31 | if len(links) > 0: 32 | project_ct = -1 33 | for l in links: 34 | if l.text == "Proposal": # begins each new project 35 | project_ct += 1 36 | with open("MRIP/" + projects[project_ct] + "/" + "source.html", 'w') as f: 37 | f.write(str(driver.page_source)) 38 | 39 | res = urlopen(Request(l.get_attribute("href"))) 40 | with open("MRIP/" + projects[project_ct] + "/" + l.text + ".pdf", 'wb') as pdf: 41 | pdf.write(res.read()) 42 | 43 | time.sleep(1) 44 | 45 | driver.find_element_by_id("dijit_form_Button_4_label").click() 46 | time.sleep(1) 47 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Kiva-API.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | 5 | status = ["funded", "expired"] 6 | 7 | all_loans = [] 8 | 9 | for s in status: 10 | for i in range(1, 2): # change range to 10000 11 | # set base url 12 | base_url = "http://api.kivaws.org/v1/loans/search" 13 | 14 | # set response format 15 | response_format = ".json" 16 | 17 | # set search parameters 18 | search_params = {"status": s, 19 | "sort_by": "newest", 20 | "page": i} 21 | 22 | # make request 23 | r = requests.get(base_url + response_format, params=search_params) 24 | time.sleep(1.1) 25 | response_text = r.text 26 | 27 | # Convert JSON response to a dictionary 28 | data = json.loads(response_text) 29 | 30 | last_date = data["loans"][-1]["posted_date"] 31 | 32 | if "2016" in last_date[:4]: 33 | for l in data["loans"]: 34 | l_id = str(l["id"]) 35 | 36 | # set base url 37 | base_url = "http://api.kivaws.org/v1/loans/" 38 | 39 | # set response format 40 | response_format = ".json" 41 | 42 | # make request 43 | r = requests.get(base_url + l_id + response_format) 44 | time.sleep(1.1) 45 | response_text = r.text 46 | 47 | # Convert JSON response to a dictionary 48 | detailed_data = json.loads(response_text) 49 | final_data = detailed_data["loans"][0] 50 | 51 | r = requests.get(base_url + l_id + "/teams" + response_format) 52 | time.sleep(1.1) 53 | response_text = r.text 54 | team_data = json.loads(response_text) 55 | final_data["team_count"] = len(team_data["teams"]) 56 | 57 | all_loans.append(final_data) 58 | 59 | else: 60 | break 61 | 62 | json.dump(all_loans, open("kiva_data.json", "w")) 63 | 
-------------------------------------------------------------------------------- /BGG-BS.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import urllib.parse 3 | from bs4 import BeautifulSoup 4 | import requests 5 | import json 6 | import time 7 | 8 | df = pd.read_csv('game_data.csv') 9 | game_names = set([x.replace(' Rules', '') for x in df['Title']]) 10 | print(len(game_names)) 11 | 12 | all_dicts = [] 13 | for g in game_names: 14 | game = {'Title': g} 15 | 16 | enc = urllib.parse.quote_plus(g) 17 | search_url = 'https://boardgamegeek.com/geeksearch.php?action=search&objecttype=boardgame&q={}&B1=Go'.format( 18 | enc) 19 | 20 | print(search_url) 21 | 22 | res = requests.get(search_url).text 23 | soup = BeautifulSoup(res, 'html5lib') 24 | 25 | first_result = soup.find('tr', {'id': 'row_'}) 26 | 27 | try: 28 | metadata = [ 29 | x.text.strip().replace( 30 | '\n', 31 | ' ').replace( 32 | '\t', 33 | '').replace( 34 | ' ', 35 | ' ') for x in first_result.find_all('td')] 36 | game['rank'], game['name'], game['geek_rating'], game[ 37 | 'avg_rating'], game['voters'] = [metadata[0]] + metadata[2:-1] 38 | sub_url = 'https://boardgamegeek.com' + \ 39 | first_result.find_all('td')[2].find('a')['href'] 40 | 41 | for l in requests.get(sub_url).text.split('\n'): 42 | if l.strip().startswith('GEEK.geekitemPreload'): 43 | data = json.loads(l.strip()[23:-1]) 44 | game = {**game, **data['item']['stats']} 45 | 46 | all_dicts.append(game) 47 | json.dump(all_dicts, open('all_dicts.json', 'w')) 48 | time.sleep(1) 49 | 50 | except: 51 | all_dicts.append(game) 52 | json.dump(all_dicts, open('all_dicts.json', 'w')) 53 | time.sleep(1) 54 | 55 | df2 = pd.DataFrame(all_dicts) 56 | 57 | match = [] 58 | for t in df2['Title']: 59 | for o in df['Title']: 60 | if o.startswith(t): 61 | match.append(o) 62 | break 63 | 64 | df2['Title'] = match 65 | df.merge(df2, on=('Title')).to_csv('game_data_with_bgg.csv', index=False) 66 | -------------------------------------------------------------------------------- /BoardGameCapital-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import time as time_lib 11 | 12 | driver = webdriver.Chrome() 13 | next_page = "http://www.boardgamecapital.com/board-game-rules.htm" 14 | driver.get(next_page) 15 | 16 | soup = BeautifulSoup(driver.page_source, 'html5lib') 17 | game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1] 18 | 19 | game_dict = {} 20 | 21 | for g in game_cells: 22 | game_dict[g.text] = {} 23 | game_dict[g.text]['link'] = 'http://www.boardgamecapital.com/' + \ 24 | g.find('a')['href'] 25 | 26 | for k in game_dict.keys(): 27 | print(k) 28 | driver.get(game_dict[k]['link']) 29 | 30 | soup = BeautifulSoup(driver.page_source, 'html5lib') 31 | 32 | gstats1 = [x.split(':') for x in soup.find( 33 | 'div', {'class': 'gstats1'}).text.split('\n')] 34 | price = gstats1[0][1].strip()[1:] 35 | time = gstats1[1][1].strip() 36 | 37 | gstats2 = [x.split(':') for x in soup.find( 38 | 'div', {'class': 'gstats2'}).text.split('\n')] 39 | age = gstats2[0][1].strip() 40 | players = gstats2[1][1].strip() 41 | 42 | text = soup.find('div', {'class', 
'mainbody'}).text 43 | 44 | pdf_links = [ 45 | a for a in soup.find( 46 | 'div', { 47 | 'class', 'mainbody'}).find_all('a') if 'Game Rules' in a.text] 48 | 49 | paths = [] 50 | for url in pdf_links: 51 | path = 'pdfs/{}.pdf'.format(url.text) 52 | with open(path, 'wb') as f: 53 | f.write(requests.get(url['href']).content) 54 | 55 | paths.append(path) 56 | 57 | paths = ';'.join(paths) 58 | 59 | game_dict[k]['price'] = price 60 | game_dict[k]['time'] = time 61 | game_dict[k]['age'] = age 62 | game_dict[k]['players'] = players 63 | game_dict[k]['paths'] = paths 64 | game_dict[k]['web_text'] = text 65 | 66 | time_lib.sleep(1) 67 | -------------------------------------------------------------------------------- /IMSDB-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup, NavigableString, Tag 3 | import time 4 | import urllib 5 | import pickle 6 | 7 | res = requests.get('http://www.imsdb.com/all%20scripts/').text 8 | 9 | soup = BeautifulSoup(res, 'html5lib') 10 | 11 | movies = soup.find_all('td', {'valign': 'top'})[2].find_all('p') 12 | 13 | base_url = 'http://www.imsdb.com' 14 | movie_urls = [ 15 | base_url + 16 | urllib.parse.quote( 17 | m.find('a')['href']) for m in movies] 18 | 19 | all_meta = [] 20 | # all_meta = pickle.load(open('meta_dicts.pkl', 'rb')) 21 | for i, url in enumerate(movie_urls[:3]): 22 | print(i) 23 | res = requests.get(url).text 24 | soup = BeautifulSoup(res, 'html5lib') 25 | 26 | script_details = soup.find('table', {'class': 'script-details'}) 27 | 28 | title = script_details.find('h1').text.strip() 29 | 30 | split_details = script_details.find_all('td')[2] 31 | 32 | meta_data = {'title': title} 33 | for t in split_details.find_all('b'): 34 | 35 | sibling_data = '' 36 | for s in t.next_siblings: 37 | if isinstance(s, NavigableString): 38 | if len(str(s).strip()) > 1: 39 | sibling_data += str(s).strip() 40 | break 41 | elif isinstance(s, Tag): 42 | try: 43 | if s.name == 'a': 44 | sibling_data += s.text + ';' 45 | except: 46 | pass 47 | 48 | if s.name == 'b': 49 | break 50 | 51 | meta_data[t.text] = sibling_data 52 | 53 | all_meta.append(meta_data) 54 | 55 | if "Read" in script_details.find_all('a')[-1].text: 56 | 57 | script_link = base_url + \ 58 | urllib.parse.quote(script_details.find_all('a')[-1]['href']) 59 | 60 | script_path = "scripts/" + title + '.html' 61 | with open(script_path, 'w') as f: 62 | f.write(requests.get(script_link).text) 63 | 64 | else: 65 | script_path = "NA" 66 | 67 | meta_data['script_path'] = script_path 68 | 69 | pickle.dump(all_meta, open('meta_dicts.pkl', 'wb')) 70 | 71 | time.sleep(1) 72 | -------------------------------------------------------------------------------- /NHSTrustsInfo-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import csv 5 | 6 | 7 | trust_url = 'https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx' 8 | res = requests.get(trust_url) 9 | soup = BeautifulSoup(res.text, 'lxml') 10 | 11 | all_trusts = [x for x in soup('a') if x['href'].startswith('/Services/Trusts/Overview/DefaultView.aspx?id=')] 12 | 13 | all_items = [] 14 | for t in all_trusts: 15 | trust_name = t.text 16 | print(trust_name) 17 | trust_site = 'https://www.nhs.uk' + t['href'].replace('Overview', 'HospitalsAndClinics') 18 | res = requests.get(trust_site) 19 | soup = BeautifulSoup(res.text, 'lxml') 20 | items = [x for x in soup.find_all('div', 
{'class': 'panel-content'}) if 'Address' in str(x)] 21 | for i in items: 22 | item_name = i.find('h3') 23 | if item_name: 24 | item_name = item_name.text 25 | else: 26 | continue 27 | 28 | if not i.find('a'): 29 | continue 30 | 31 | if i.find('a')['href'].startswith('/Services'): 32 | url = 'https://www.nhs.uk' + i.find('a')['href'] 33 | service_type = i.find('a')['href'].split('/')[2].title() 34 | else: 35 | url = i.find('a')['href'] 36 | service_type = 'Other' 37 | 38 | properties = [x.text for x in i.find('dl').find_all('dt')] 39 | values = [BeautifulSoup(str(x).replace('<br/>
', ', '), 'lxml').text for x in i.find('dl').find_all('dd')] 40 | 41 | info_dict = {'Name': item_name, 42 | 'URL': url, 43 | 'Type': service_type, 44 | 'Trust Name': trust_name} 45 | for i,k in enumerate(properties): 46 | if k in ['PostCode', 'Ext', 'Website']: 47 | continue 48 | info_dict[k.strip(':')] = values[i] 49 | 50 | all_items.append(info_dict) 51 | 52 | time.sleep(2) 53 | 54 | 55 | keys = ['Name', 'Trust Name', 'Type', 'Tel', 'Address', 'Email', 'URL'] 56 | with open('nhs_sites.csv', 'w', newline='') as output_file: 57 | dict_writer = csv.DictWriter(output_file, keys) 58 | dict_writer.writeheader() 59 | dict_writer.writerows(all_items) 60 | -------------------------------------------------------------------------------- /INOCAR-AJAX.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import re 5 | import csv 6 | import time 7 | from random import randint 8 | import pickle 9 | import os.path 10 | 11 | id_dict = {"61": "San Lorenzo", "377": "Esmeraldes"} 12 | days = [str(x) for x in list(range(1, 32))] 13 | months = [str(x) for x in list(range(1, 13))] 14 | years = [str(x) for x in list(range(2003, 2016))] 15 | 16 | if os.path.isfile("already_scraped.pkl"): 17 | already_scraped = pickle.load(open("already_scraped.pkl", "rb")) 18 | else: 19 | already_scraped = [] 20 | 21 | for l in id_dict.keys(): 22 | for y in years: 23 | for m in months: 24 | for d in days: 25 | date = d + "/" + m + "/" + y 26 | if (l, date) not in already_scraped: 27 | payload = { 28 | "id_puerto": l, 29 | "dias": d, 30 | "mes": m, 31 | "anio": y, 32 | "task": "generate", 33 | "tipocon": "form_", 34 | "Submit": "Ver", 35 | } 36 | 37 | r = requests.post( 38 | url='http://www.inocar.mil.ec/mareas/consulta.php', 39 | data=payload 40 | ) 41 | 42 | soup = BeautifulSoup(r.text, "lxml") 43 | 44 | r1 = soup.findAll("tr", {"class": "row_1"})[2:4] 45 | r2 = soup.findAll("tr", {"class": "row_2"})[2:4] 46 | rows = [tuple(r1[0].get_text().split('\n')), 47 | tuple(r2[0].get_text().split('\n')), 48 | tuple(r1[1].get_text().split('\n')), 49 | tuple(r2[1].get_text().split('\n'))] 50 | 51 | with open('data.csv', 'a') as f: 52 | a = csv.writer(f) 53 | for r in rows: 54 | row = (id_dict[l], date) + r 55 | a.writerow(row) 56 | 57 | already_scraped.append((l, date)) 58 | pickle.dump( 59 | already_scraped, open( 60 | "already_scraped.pkl", "wb")) 61 | time.sleep(randint(1, 3)) 62 | -------------------------------------------------------------------------------- /GoogleGeoLatLong-API.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.request import Request, urlopen 3 | import time 4 | import csv 5 | 6 | 7 | def getJson(lat, longi): 8 | url = 'http://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s&sensor=true' % \ 9 | (lat, longi) 10 | 11 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) 12 | 13 | response = urlopen(req).read().decode('utf-8') 14 | responseJson = json.loads(response)['results'] 15 | 16 | return responseJson 17 | 18 | latlong = [(18.6, - 19 | 100.566667), (19.6, - 20 | 100.566667), (19.6, - 21 | 101.566667), (17.6, - 22 | 100.566667), (27.121381, - 23 | 107.200644), (37.586630, - 24 | 123.233372), (25.267348, - 25 | 120.087235), (19.6, - 26 | 96.566667), (17.6, - 27 | 98.566667), (37.882042, - 28 | 122.277562)] 29 | 30 | municps = [] 31 | for coord in latlong: 32 | switch = 0 33 | info = getJson(coord[0], coord[1]) 34 | # 
municps.append(info.get("results")[1].get("address_components")[0].get("long_name")) 35 | # #if certain data is there 36 | for result in info: # to avoid errors if incorrect data 37 | for address_component in result['address_components']: 38 | if address_component['types'] == [ 39 | "administrative_area_level_2", "political"]: 40 | municps.append(address_component['long_name']) 41 | switch = 1 42 | break 43 | break 44 | 45 | if switch == 1: 46 | continue 47 | else: 48 | municps.append("None") 49 | 50 | time.sleep(.11) 51 | 52 | 53 | latlongname = list(zip(latlong, municps)) 54 | 55 | with open('data.csv', 'w') as out: 56 | csv_out = csv.writer(out) 57 | csv_out.writerow(['lat-long', 'name']) 58 | for row in latlongname: 59 | csv_out.writerow(row) 60 | -------------------------------------------------------------------------------- /LARRP-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup # to parse HTML 2 | import csv # to write CSV 3 | import pandas as pd # to see CSV 4 | import time 5 | import os 6 | import random 7 | import requests 8 | 9 | 10 | def dl_pages(base_url, books, pres): 11 | os.mkdir(pres) 12 | for i1, b in enumerate(books): 13 | next_page = b 14 | res = requests.get(next_page).text 15 | soup = BeautifulSoup(res, 'html5lib') 16 | book_title = soup.find('h3').text 17 | 18 | os.mkdir(pres + '/' + book_title + '-' + str(i1)) 19 | 20 | try: 21 | for i in range(1, 10000): 22 | res = requests.get(next_page).text 23 | 24 | soup = BeautifulSoup(res, 'html5lib') 25 | 26 | if 'Discurso al proclamarse su candidatura' in book_title: 27 | next_page = base_url + \ 28 | soup.find('center').find_all('a')[1]['href'] 29 | else: 30 | next_page = base_url + \ 31 | soup.find('center').find_all('a')[2]['href'] 32 | 33 | tif_link = base_url + \ 34 | [x['href'] for x in soup.find_all('a') if 'tif' in x['href']][0] 35 | 36 | res = requests.get(tif_link).content 37 | 38 | with open(pres + '/' + book_title + '-' + str(i1) + '/page-' + str(i) + '.tif', 'wb') as f: 39 | f.write(res) 40 | 41 | time.sleep(1) 42 | except: 43 | continue 44 | 45 | 46 | books = [ 47 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/180002t.html', 48 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/190117t.html', 49 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/200253t.html', 50 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/210286t.html', 51 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/170347.html'] 52 | 53 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/yrigoyen/' 54 | 55 | dl_pages(base_url, books, 'yrigoyen') 56 | 57 | 58 | res = requests.get( 59 | 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/index.html').text 60 | 61 | soup = BeautifulSoup(res, 'html5lib') 62 | 63 | books = [] 64 | base_url = 'http://lanic.utexas.edu/larrp/pm/sample2/argentin/peron/' 65 | for li in soup.find('ul').find_all('li'): 66 | link = [x for x in li.find_all('a') if 'idx' not in x['href']][0] 67 | 68 | if not link.text.strip().startswith('I'): 69 | books.append(base_url + link['href']) 70 | 71 | 72 | dl_pages(base_url, books, 'peron') 73 | -------------------------------------------------------------------------------- /Wiktionary-API.py: -------------------------------------------------------------------------------- 1 | '''This script scrapes wiktionary to get MHG lemmas of NHG lemmas.''' 2 | 3 | from bs4 import BeautifulSoup 4 | from urllib.request import Request, urlopen 5 | 
import time 6 | from string import punctuation 7 | import urllib.parse 8 | import treetaggerwrapper 9 | import json 10 | import time 11 | from random import randint 12 | import os 13 | import pyprind 14 | 15 | 16 | # get words from freq list and translations 17 | with open("top10000.txt", "r") as f: 18 | words = f.read().split() 19 | 20 | with open("NHG.txt", "r") as f: 21 | more_words = f.read().split() 22 | 23 | all_words = set(words + more_words) 24 | 25 | # turn words to set of lemmas 26 | tagger = treetaggerwrapper.TreeTagger(TAGLANG='de') 27 | 28 | lemmas = [] 29 | for w in all_words: 30 | lemm = tagger.tag_text(w)[0].split("\t")[-1] 31 | lemmas.append(lemm) 32 | 33 | lemmas = set(lemmas) 34 | 35 | # start scraping here 36 | base = "https://de.wiktionary.org/w/api.php?format=xml&action=query&titles=" 37 | branch = "&rvprop=content&prop=revisions&redirects=1" 38 | 39 | if os.path.isfile("cognate_dict.json"): 40 | cognate_dict = json.load(open("cognate_dict.json", "r")) 41 | else: 42 | cognate_dict = {} 43 | 44 | bar = pyprind.ProgBar(len(lemmas), monitor=True, bar_char="#") 45 | for w in lemmas: 46 | 47 | if w not in cognate_dict: 48 | 49 | # for UTF-8 URL parsing 50 | url = base + w + branch 51 | url_word = urllib.parse.quote(w) 52 | url = base + url_word + branch 53 | 54 | html = urlopen(url) 55 | bsObj = BeautifulSoup(html.read(), "lxml") 56 | text = bsObj.get_text() 57 | 58 | if "mittelhochdeutsch" in text: 59 | ind = text.index("mittelhochdeutsch") 60 | cognates = text[ind:].split("''") 61 | 62 | if len(cognates) > 1: 63 | cognates = cognates[1].split() 64 | for i, c in enumerate(cognates): 65 | if "|" in c: 66 | cognates[i] = c.split("|")[-1] 67 | 68 | for char in punctuation: 69 | cognates = [c.replace(char, "") for c in cognates] 70 | 71 | cognates = [c for c in cognates if len(c) > 0 and c[ 72 | 0].isalpha()] 73 | 74 | cognate_dict[w] = cognates 75 | 76 | with open("cognate_dict.json", "w") as f: 77 | json.dump(cognate_dict, f) 78 | 79 | time.sleep(randint(1, 3)) 80 | 81 | else: 82 | cognate_dict[w] = None 83 | 84 | with open("cognate_dict.json", "w") as f: 85 | json.dump(cognate_dict, f) 86 | 87 | else: 88 | 89 | cognate_dict[w] = None 90 | 91 | with open("cognate_dict.json", "w") as f: 92 | json.dump(cognate_dict, f) 93 | 94 | bar.update() 95 | 96 | print("Done!") 97 | -------------------------------------------------------------------------------- /ADA-ERP-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | import pickle 6 | import csv 7 | 8 | 9 | def get_pages(soup): 10 | ''' 11 | gets links to any subsequent pages 12 | ''' 13 | base = 'https://professional.diabetes.org' 14 | try: 15 | page_links = soup.find('ul', {'class': 'pagination'}).find_all('a') 16 | links = [base + a['href'] for a in page_links] 17 | return set(links) 18 | except: 19 | return None 20 | 21 | 22 | def get_org_dicts(soup): 23 | ''' 24 | turn any listed organizations on page to dictionaries 25 | ''' 26 | 27 | orgs = soup.find_all('div', {'class': 'col col-sm-4'}) 28 | 29 | org_dicts = [] 30 | 31 | for o in orgs: 32 | meta = o.find_all('div') 33 | org_dict = {} 34 | 35 | # up to colon is key after is value 36 | pattern = re.compile('(.*?):(.*)') 37 | for m in meta: 38 | try: 39 | groups = re.search(pattern, m.text).groups() 40 | title = groups[0].strip() 41 | value = groups[1].strip() 42 | org_dict[title] = value 43 | except: 44 | pass 45 | 46 | org_dicts.append(org_dict) 47 | 48 
| return org_dicts 49 | 50 | 51 | if __name__ == "__main__": 52 | # get list of states from sample URL 53 | init = 'https://professional.diabetes.org/erp_list?field_erp_state_value=NY' 54 | res = requests.get(init) 55 | soup = BeautifulSoup(res.text, 'html5lib') 56 | options = soup.find( 57 | 'select', {'id': 'edit-field-erp-state-value'}).find_all('option') 58 | states = [x['value'] for x in options] 59 | 60 | # start iteration through state URLS 61 | all_dicts = [] 62 | for s in states: 63 | print(s) 64 | state_link = 'https://professional.diabetes.org/erp_list?field_erp_state_value={}'.format( 65 | s) 66 | res = requests.get(state_link) 67 | soup = BeautifulSoup(res.text, 'html5lib') 68 | 69 | # get dicts 70 | all_dicts.extend(get_org_dicts(soup)) 71 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb')) 72 | 73 | # get extra pages 74 | pages = get_pages(soup) 75 | 76 | # cycle through subsequent pages 77 | if pages != None: 78 | for p in pages: 79 | res = requests.get(p) 80 | soup = BeautifulSoup(res.text, 'html5lib') 81 | all_dicts.extend(get_org_dicts(soup)) 82 | time.sleep(1) 83 | pickle.dump(all_dicts, open('all-dicts.pkl', 'wb')) 84 | time.sleep(1) 85 | 86 | # dump csv 87 | with open('erp.csv', 'w') as csvfile: 88 | fieldnames = list(all_dicts[0].keys()) 89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 90 | writer.writeheader() 91 | writer.writerows(all_dicts) 92 | -------------------------------------------------------------------------------- /RioGrandeGames-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import pickle 11 | 12 | 13 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true']) 14 | driver.get('http://riograndegames.com/search.html?category%5B%5D=5&category%5B%5D=10&category%5B%5D=14&category%5B%5D=1&category%5B%5D=2&category%5B%5D=12&category%5B%5D=3&category%5B%5D=6&category%5B%5D=8&category%5B%5D=9&category%5B%5D=4&category%5B%5D=13&category%5B%5D=22&category%5B%5D=16&category%5B%5D=11&category%5B%5D=7&category%5B%5D=17&category%5B%5D=18&category%5B%5D=15&language=0&min_players=0&length=0&min_age=0&term=') 15 | search_results = driver.find_element_by_css_selector( 16 | 'div#search_results.isotope').find_elements_by_css_selector('div.search_item.isotope-item') 17 | 18 | games_dicts = [] 19 | attributes = [ 20 | 'data-title', 21 | 'data-orig', 22 | 'data-length', 23 | 'data-date', 24 | 'data-age', 25 | 'data-players', 26 | 'data-msrp'] 27 | 28 | for s in search_results: 29 | game = {} 30 | for a in attributes: 31 | game[a] = s.get_attribute(a) 32 | 33 | game['page_link'] = s.find_element_by_css_selector( 34 | 'a').get_attribute('href') 35 | 36 | games_dicts.append(game) 37 | 38 | 39 | final_games_dicts = [] 40 | for g in games_dicts: 41 | print(g['data-title']) 42 | driver.get(g['page_link']) 43 | cats = driver.find_elements_by_css_selector('span.game_cat') 44 | cats = [c.text.replace(',', '') for c in cats] 45 | g['game_category'] = ';'.join(cats) 46 | 47 | # unfold and download 48 | driver.find_element_by_css_selector('span.button2').click() 49 | 50 | asset_links = driver.find_elements_by_css_selector('p.asset_list a') 51 | 52 | for a in asset_links: 53 | images = 
a.find_elements_by_css_selector("img") 54 | for i in images: 55 | if "rules" in i.get_attribute('title').lower(): 56 | download = a.get_attribute('href') 57 | session = requests.Session() 58 | cookies = driver.get_cookies() 59 | 60 | for cookie in cookies: 61 | session.cookies.set(cookie['name'], cookie['value']) 62 | response = session.get(download) 63 | 64 | dl_path = 'pdfs/' + g['data-title'] + '.pdf' 65 | 66 | with open(dl_path, 'wb') as f: 67 | f.write(response.content) 68 | 69 | g['pdf_path'] = dl_path 70 | final_games_dicts.append(g) 71 | pickle.dump(final_games_dicts, open('game_dicts.pkl', 'wb')) 72 | 73 | time.sleep(1) 74 | break 75 | break 76 | 77 | time.sleep(1) 78 | -------------------------------------------------------------------------------- /STNMFSNOAA-BS.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import time 5 | 6 | # send payload to get list of species 7 | payload = {'qwhocalled': 'monthly', 8 | 'qcommon': '', 9 | 'qreturn': 'Search', 10 | 'qselect': 'List Empty, Do a Search to Fill'} 11 | r = requests.get( 12 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES', 13 | params=payload) 14 | 15 | soup = BeautifulSoup(r.content, "lxml") 16 | species = [x.text for x in soup.findAll("option")] 17 | 18 | # iterate through species 19 | for sp in species: 20 | 21 | if not os.path.exists(sp.replace(",", "").replace( 22 | " ", "-").replace("/", "_")): # if need to restart script 23 | 24 | # make directory for species 25 | os.mkdir(sp.replace(",", "").replace(" ", "-").replace("/", "_")) 26 | 27 | # send payload to get different states and regions 28 | payload = {'qwhocalled': 'monthly', 29 | 'qcommon': '', 30 | 'qreturn': 'Return', 31 | 'qselect': sp} 32 | r = requests.get( 33 | 'https://www.st.nmfs.noaa.gov/pls/webpls/FT_HELP.SPECIES', 34 | params=payload) 35 | 36 | soup = BeautifulSoup(r.content, "lxml") 37 | states = [ 38 | x.text for x in soup.find( 39 | "select", { 40 | "name": "qstate"}).findAll("option")] 41 | 42 | # iterate through different regions and states 43 | for st in states: 44 | 45 | payload = {'qspecies': sp, 46 | 'qreturn': 'Species Locator', 47 | 'qyearfrom': '1990', 48 | 'qyearto': '2015', 49 | 'qmonth': 'YEAR BY MONTH', 50 | 'qstate': st, 51 | 'qoutput_type': 'TABLE'} 52 | r = requests.get( 53 | 'http://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS', 54 | params=payload) 55 | 56 | # save html tables into folders 57 | with open(sp.replace(",", "").replace(" ", "-").replace("/", "_") + "/" + st + ".html", "w") as f: 58 | f.write(str(r.content)) 59 | 60 | # don't overload server 61 | time.sleep(.1) 62 | 63 | # get all species from main page 64 | os.mkdir('ALL-SPECIES-COMBINED') 65 | 66 | # iterate through different states and regions 67 | for st in states: 68 | 69 | payload = {'qspecies': 'ALL SPECIES COMBINED', 70 | 'qreturn': 'Species Locator', 71 | 'qyearfrom': '1990', 72 | 'qyearto': '2015', 73 | 'qmonth': 'YEAR BY MONTH', 74 | 'qstate': st, 75 | 'qoutput_type': 'TABLE'} 76 | 77 | r = requests.get( 78 | 'https://www.st.nmfs.noaa.gov/pls/webpls/MF_MONTHLY_LANDINGS.RESULTS', 79 | params=payload) 80 | 81 | with open('ALL-SPECIES-COMBINED' + "/" + st + ".html", "w") as f: 82 | f.write(str(r.content)) 83 | -------------------------------------------------------------------------------- /BAAD-BS.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup # to parse HTML 2 | 
import csv # to write CSV 3 | import pandas as pd # to see CSV 4 | import time 5 | import os 6 | import random 7 | import requests 8 | 9 | next_page = 'http://www.start.umd.edu/baad/database' 10 | base_url = 'http://www.start.umd.edu' 11 | 12 | all_rows = [] 13 | all_rows.append(['ID', 14 | 'Group Name', 15 | 'Country', 16 | 'Lethality', 17 | 'Number of Allies', 18 | 'Number of Rivals', 19 | 'Founded', 20 | 'Fatalities', 21 | 'Fatality Years', 22 | 'Ideologies', 23 | 'Strength', 24 | 'Territorial Control', 25 | 'Funding through Drug Trafficking', 26 | 'Record Year']) 27 | 28 | for i in range(1, 6): 29 | res = requests.get(next_page).text 30 | 31 | soup = BeautifulSoup(res, 'html5lib') 32 | 33 | rows = soup.find('table', {'class', 'sticky-enabled'}).find_all('tr') 34 | rows = rows[1:] 35 | 36 | for r in rows: 37 | cells = r.find_all('td') 38 | cell_text = [x.text.strip() for x in cells] 39 | link = base_url + cells[0].find('a')['href'] 40 | 41 | res = requests.get(link).text 42 | soup = BeautifulSoup(res, 'html5lib') 43 | 44 | year_bullets = soup.find('div', {'class': 'item-list'}).find_all('li') 45 | year_urls = [(base_url + x.find('a')['href'], 46 | x.find('a').text.strip()) for x in year_bullets] 47 | for u in year_urls: 48 | record_year = u[1] 49 | res = requests.get(u[0]).text 50 | soup = BeautifulSoup(res, 'html5lib') 51 | 52 | founded = soup.find( 53 | 'div', {'class', 'quick-view-founded'}).text.split(':')[-1].strip() 54 | fatalities, fatality_years = soup.find( 55 | 'div', {'class', 'quick-view-lethality'}).text.split(':')[-1].strip().split(' ', maxsplit=1) 56 | ideology = soup.find( 57 | 'div', {'class', 'quick-view-ideology'}).text.split(':')[-1].strip() 58 | strength = soup.find( 59 | 'div', {'class', 'quick-view-strength'}).text.split(':')[-1].strip() 60 | terrcnt = soup.find( 61 | 'div', {'class', 'quick-view-terrcnt'}).text.split(':')[-1].strip() 62 | drugs = soup.find( 63 | 'div', {'class', 'quick-view-drug-funding'}).text.split(':')[-1].strip() 64 | 65 | data_row = [ 66 | cell_text[0] + '-' + record_year] + cell_text + [ 67 | founded, 68 | fatalities, 69 | fatality_years, 70 | ideology, 71 | strength, 72 | terrcnt, 73 | drugs, 74 | record_year] 75 | print(data_row) 76 | all_rows.append(data_row) 77 | 78 | time.sleep(1) 79 | 80 | time.sleep(1) 81 | 82 | next_page = 'http://www.start.umd.edu/baad/database?page={}'.format(str(i)) 83 | time.sleep(1) 84 | 85 | 86 | with open("baad.csv", "w") as f: 87 | csv_w = csv.writer(f) 88 | csv_w.writerows(all_rows) 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # web-scrapers 2 | 3 | Various web scrapers for research and fun: 4 | 5 | - [Board Game Capital](http://www.boardgamecapital.com/board-game-rules.htm) 6 | - [CTS Net](https://www.ctsnet.org/) 7 | - [Minutes of the Federal Reserve Board of Governors discount rate](https://www.federalreserve.gov/monetarypolicy/discountrate.htm) 8 | - [Doximity](https://www.doximity.com/) 9 | - [Energy - The Automated Register of Implemented Actions](https://www.energy.gov/eere/downloads/automated-register-implemented-actions) 10 | - [Lucid Chart](https://www.lucidchart.com/) 11 | - [GLERL NOAA](https://www.glerl.noaa.gov//metdata/status/status_archive/) 12 | - [American Historical Association](http://careers.historians.org/jobs/) 13 | - [Grimm Fairy Tales](https://www.cs.cmu.edu/~spok/grimmtmp/) 14 | - [Perrault Fairy Tales](http://www.pitt.edu/~dash/perrault.html) 15 
| - [IMSDB](http://www.imsdb.com/all%20scripts/) 16 | - [Glass Door API](https://www.glassdoor.com/index.htm) 17 | - [Crunch Base API](https://data.crunchbase.com/docs) 18 | - [Google Directions](https://www.google.com/maps/dir/) 19 | - [INOCAR](http://www.inocar.mil.ec/web/index.php) 20 | - [Kalamazoo](http://scholarworks.wmich.edu/) 21 | - [Kiva API](https://www.kiva.org/) 22 | - [Google Geocoding API](https://developers.google.com/maps/documentation/geocoding/start) 23 | - [Google Search](https://www.google.com/) 24 | - [GLO Records](https://glorecords.blm.gov/default.aspx) 25 | - [MRIP NOAA](http://www.st.nmfs.noaa.gov/recreational-fisheries/MRIP/mrip-project) 26 | - [Web of Science](http://ipscience-help.thomsonreuters.com/LAMRService/WebServiceOperationsGroup/requestAPIWoS.html) 27 | - [ST NMFS NOAA](https://www.st.nmfs.noaa.gov/) 28 | - [NCDC NOAA](https://www.ncdc.noaa.gov/cdr/atmospheric/total-solar-irradiance) 29 | - [Open Secrets](https://www.opensecrets.org/resources/create/apis.php) 30 | - [Resident Advisor](https://www.residentadvisor.net/reviews.aspx?format=single) 31 | - [Rate My Professors](http://www.ratemyprofessors.com/) 32 | - [LARRP](http://lanic.utexas.edu/larrp/pm/sample2/) 33 | - [Wiktionary](https://de.wiktionary.org/) 34 | - [Wikipedia Revision History API](https://www.mediawiki.org/wiki/API:Revisions) 35 | - [Big, Allied and Dangerous BAAD](http://www.start.umd.edu/baad/database) 36 | - [Rio Grande Games](http://riograndegames.com/) 37 | - [Bayerische Staatsbibliothek](https://opacplus.bsb-muenchen.de/) 38 | - [DataSF Fire Incidents API](https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric) 39 | - [Google Geocoding API Searches](https://developers.google.com/maps/documentation/geocoding/start) 40 | - [Board Game Geek](https://boardgamegeek.com/) 41 | - [Data Mart Basic Skills](http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx) 42 | - [Public Access to Court Electronic Records (PACER)](https://www.pacer.gov/) 43 | - [SF Planning Commission Minutes](http://default.sfplanning.org/meetingarchive/planning_dept/sf-planning.org/index.aspx-page=1000.html) 44 | - [American Diabetes Association ERP Resources](https://professional.diabetes.org/erp_list?field_erp_state_value=NY) 45 | - [National Stolen Art File](https://www.fbi.gov/investigate/violent-crime/art-theft/national-stolen-art-file) 46 | - [NHS Trusts](https://www.nhs.uk/ServiceDirectories/Pages/NHSTrustListing.aspx) 47 | -------------------------------------------------------------------------------- /BSBDigitaleSammlungen-API.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | import re 11 | import pickle 12 | import numpy as np 13 | 14 | 15 | # PART 1 16 | # first collect bsb ids from search of years 700-1400 17 | driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true']) 18 | 19 | driver.maximize_window() 20 | 21 | driver.get("https://opacplus.bsb-muenchen.de/metaopac/start.do") 22 | driver.find_element_by_css_selector( 23 | 'input#searchRestrictionValue1_2.form-control').send_keys('700') 24 | driver.find_element_by_css_selector( 25 | 'input#searchRestrictionValue2_2.form-control').send_keys('1400') 26 | 
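# the two inputs above restrict the search to the years 700-1400; the next two clicks submit the search and pick a facet from the results sidebar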
driver.find_element_by_css_selector( 27 | 'input#submitSearch.btn.btn-default.dbuttonb').click() 28 | driver.find_element_by_css_selector( 29 | '#availableFacets > ul > li:nth-child(4) > ul > li:nth-child(5) > a > span.hidden-xs').click() 30 | 31 | time.sleep(5) 32 | 33 | print(driver.find_element_by_css_selector( 34 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.col-xs-9.col-md-5 > h2').text) 35 | 36 | bsbs = [] 37 | pattern = r'bsb[0-9]+' 38 | 39 | for i in range(2000): 40 | 41 | print(i) 42 | 43 | soup = BeautifulSoup(driver.page_source, 'html5lib') 44 | 45 | rows = soup.find_all('td', {'class': 'resultscell'}) 46 | 47 | for r in rows: 48 | links = r.find_all('a') 49 | for l in links: 50 | if re.search(pattern, l['href']): 51 | bsbs.append(re.search(pattern, l['href']).group()) 52 | 53 | pickle.dump(bsbs, open('bsbs.pkl', 'wb')) 54 | 55 | driver.find_element_by_css_selector( 56 | '#speed_result_list_100 > div > div.nav.nav-tabs.box-header.navigation > div.hidden-xs.hidden-sm.col-xs-7.col-md-7.pull-right.pagination > div > ul > li:nth-child(8) > a').click() 57 | time.sleep(5) 58 | 59 | 60 | # PART 2 61 | # now read in list of bsb ids and collect API data 62 | 63 | 64 | def get_dimensions(res): 65 | 66 | width = [] 67 | height = [] 68 | for p in res['sequences'][0]['canvases']: 69 | try: 70 | scale = p['service']['physicalScale'] 71 | width.append(p['width'] * scale) 72 | height.append(p['height'] * scale) 73 | except: 74 | pass 75 | 76 | return (np.mean(height), np.mean(width)) 77 | 78 | bsbs = pickle.load(open('bsbs.pkl', 'rb')) 79 | data_dicts = [] 80 | 81 | for bsb in bsbs: 82 | print(bsb) 83 | 84 | try: 85 | res = requests.get( 86 | 'https://api.digitale-sammlungen.de/iiif/presentation/v2/{}/manifest'.format(bsb)).json() 87 | hs_dict = {} 88 | hs_dict['Thumbnail'] = res['thumbnail']['@id'] 89 | hs_dict['Label'] = res['label'] 90 | 91 | for m in res['metadata']: 92 | key = m['label'][1]['@value'] 93 | value = m['value'] 94 | 95 | if isinstance(value, list): 96 | value = value[-1]['@value'] 97 | 98 | hs_dict[key.strip()] = value.strip() 99 | 100 | hs_dict['Height'], hs_dict['Width'] = get_dimensions(res) 101 | 102 | data_dicts.append(hs_dict) 103 | pickle.dump(data_dicts, open('data_dicts.pkl', 'wb')) 104 | 105 | except: 106 | pass 107 | 108 | time.sleep(3) 109 | -------------------------------------------------------------------------------- /AHA-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | import requests 10 | 11 | driver = webdriver.Chrome() 12 | driver.get("http://careers.historians.org/jobs/?page=1") 13 | 14 | base_url = 'http://careers.historians.org' 15 | all_rows = [] 16 | pages = ["http://careers.historians.org/jobs/?page=1", 17 | "http://careers.historians.org/jobs/?page=2"] 18 | 19 | for p in pages: 20 | driver.get(p) 21 | soup = BeautifulSoup(driver.page_source, 'html5lib') 22 | 23 | rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'}) 24 | for r in rows: 25 | title = r.find('a').text.strip() 26 | link = base_url + r.find('a')['href'] 27 | employer = r.find( 28 | 'div', { 29 | 'class': 'bti-ui-job-result-detail-employer'}).text.strip() 30 | location = r.find( 31 | 'div', 
{ 32 | 'class': 'bti-ui-job-result-detail-location'}).text.strip() 33 | date_posted = r.find( 34 | 'div', { 35 | 'class': 'bti-ui-job-result-detail-age'}).text.strip() 36 | 37 | driver.get(link) 38 | 39 | soup = BeautifulSoup(driver.page_source, 'html5lib') 40 | 41 | try: 42 | job_description = soup.find( 43 | 'div', {'class': 'bti-jd-description'}).text.strip() 44 | 45 | details = soup.find('div', {'class': 'bti-jd-details-container'}) 46 | 47 | details_titles = [ 48 | x.text.replace( 49 | ':', '').lower().strip() for x in details.find_all( 50 | 'div', { 51 | 'class': 'bti-jd-detail-title'})] 52 | details_text = [ 53 | x.text.strip() for x in details.find_all( 54 | 'div', { 55 | 'class': 'bti-jd-detail-text'})] 56 | 57 | details_dict = {} 58 | 59 | for i in range(len(details_titles)): 60 | t = details_titles[i] 61 | if 'categories' in t: 62 | t = 'category' 63 | elif 'required' in t: 64 | t = 'preferred education' 65 | details_dict[t] = details_text[i] 66 | 67 | details_dict['title'] = title 68 | details_dict['link'] = link 69 | details_dict['employer'] = employer 70 | details_dict['location'] = location 71 | details_dict['date_posted'] = date_posted 72 | details_dict['job_description'] = job_description 73 | 74 | try: 75 | details_dict['employer_about'] = soup.find( 76 | 'div', {'class': 'bti-jd-employer-info'}).text.strip() 77 | except: 78 | details_dict['employer_about'] = '' 79 | 80 | all_rows.append(details_dict) 81 | 82 | except: 83 | pass 84 | 85 | time.sleep(1) 86 | 87 | header = ["title", 88 | "employer", 89 | "location", 90 | "posted", 91 | "date_posted", 92 | "primary field", 93 | "category", 94 | "preferred education", 95 | "salary", 96 | "type", 97 | "employment type", 98 | "job_description", 99 | "employer_about", 100 | "link" 101 | ] 102 | 103 | 104 | with open('AHA-data.csv', 'w') as f: 105 | w = csv.DictWriter(f, header) 106 | w.writeheader() 107 | w.writerows(all_rows) 108 | -------------------------------------------------------------------------------- /ResidentAdvisor-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | 10 | driver = webdriver.PhantomJS() 11 | next_page = "https://www.residentadvisor.net/reviews.aspx?format=single" 12 | 13 | with open("resident-adv.csv", "a") as f: 14 | csv_w_interv = csv.writer(f) 15 | csv_w_interv.writerow(["title", 16 | "artist", 17 | "single", 18 | "label", 19 | "record", 20 | "style", 21 | "reviewed_date", 22 | "release_date", 23 | "comments", 24 | "rating", 25 | "description", 26 | "URL"]) 27 | 28 | 29 | for i in range(10000): 30 | 31 | driver.get(next_page) 32 | 33 | soup = BeautifulSoup(driver.page_source, "html5lib") 34 | 35 | try: 36 | next_page = "https://www.residentadvisor.net/" + \ 37 | soup.find("li", {"class": "but arrow-left bbox"}).find("a")['href'] 38 | except: 39 | next_page = "" 40 | 41 | singles = soup.find( 42 | "div", { 43 | "id": "reviews"}).find_all( 44 | "article", { 45 | "class": "highlight-top"}) 46 | 47 | review_links = [ 48 | 'https://www.residentadvisor.net' + 49 | x.find("a")['href'] for x in singles] 50 | 51 | if i == 0: 52 | review_links = review_links[25:] 53 | 54 | for l in review_links: 55 | driver.get(l) 56 | 57 | soup = 
BeautifulSoup(driver.page_source, 'html5lib') 58 | 59 | title = soup.find("div", {"id": "sectionHead"}).find("h1").text.strip() 60 | 61 | try: 62 | artist = title.split("-")[0].strip() 63 | 64 | single = title.split("-")[1].strip() 65 | except: 66 | artist = '' 67 | single = '' 68 | 69 | print(title) 70 | 71 | rating = soup.find("span", {"class": "rating"}).text.split("/")[0] 72 | reviewed_date = soup.find("span", {"itemprop": "dtreviewed"})[ 73 | 'datetime'].strip() 74 | 75 | meta_list = soup.find("ul", {"class": "clearfix"}).find_all("li") 76 | 77 | style = meta_list[2].text.split('\n')[4] 78 | label = str(meta_list[0]).split( 79 | '
')[0].split('">')[-1].split('')[-1].split(" 0: 108 | 109 | with open("IT-cardi.csv", "a") as f: 110 | csv_w_interv = csv.writer(f) 111 | csv_w_interv.writerow( 112 | [name, hospital, phone, interests, fields, city, country, street, l]) 113 | 114 | time.sleep(random.randint(1, 3)) 115 | time.sleep(random.randint(1, 3)) 116 | -------------------------------------------------------------------------------- /Doximity-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from bs4 import BeautifulSoup # to parse HTML 4 | import csv # to write CSV 5 | import pandas as pd # to see CSV 6 | import time 7 | import os 8 | import random 9 | 10 | 11 | header = [ 12 | 'Name', 13 | 'Title', 14 | 'Hospital', 15 | 'Phone', 16 | 'State', 17 | 'Tags', 18 | 'Summary', 19 | 'Skills', 20 | 'City', 21 | 'Address'] 22 | 23 | with open("cardi.csv", "a") as f: 24 | csv_w_electro = csv.writer(f) 25 | csv_w_electro.writerow(header) 26 | 27 | driver = webdriver.PhantomJS() 28 | next_page = "https://www.doximity.com/directory/md/specialty/thoracic-surgery?from_slug=pub%2Fmichael-peter-kaye-md" 29 | 30 | for i in range(1000): 31 | 32 | driver.get(next_page) 33 | 34 | try: 35 | next_page = BeautifulSoup( 36 | driver.page_source, "html5lib").find( 37 | "a", { 38 | "class": "next_page"})['href'] 39 | next_page = "https://www.doximity.com" + next_page 40 | except: 41 | next_page = "" 42 | 43 | links = [a.get_attribute( 44 | 'href') for a in driver.find_elements_by_css_selector("ul.list-4-col a")] 45 | links = random.sample(links, 15) 46 | 47 | for l in links: 48 | 49 | driver.get(l) 50 | soup = BeautifulSoup(driver.page_source, "html5lib") 51 | 52 | try: 53 | name = soup.find("span", {"id": "user_full_name"}).text.strip() 54 | print(name) 55 | except: 56 | name = "" 57 | 58 | try: 59 | title = soup.find("p", {"itemprop": "jobTitle"}).text.strip() 60 | except: 61 | title = "" 62 | 63 | try: 64 | city = soup.find( 65 | "span", { 66 | "itemprop": "addressLocality"}).text.strip() 67 | except: 68 | city = "" 69 | 70 | try: 71 | state = soup.find("span", 72 | {"itemprop": "addressRegion"}).text.strip() 73 | except: 74 | state = "" 75 | 76 | try: 77 | address = soup.find("div", {"class": "col-1-2"}).text.strip() 78 | except: 79 | address = "" 80 | 81 | try: 82 | hospital = soup.find("section", 83 | {"class": "section hospital-info"}).findAll("span", 84 | {"itemprop": "name"}) 85 | hospitals = '; '.join([x.text.strip() for x in hospital]) 86 | except: 87 | hospitals = "" 88 | 89 | try: 90 | phone = soup.find("span", {"itemprop": "telephone"}).text.strip() 91 | except: 92 | phone = "" 93 | 94 | try: 95 | summary = soup.find( 96 | "section", { 97 | "class": "section summary-info"}).find("ul").text.strip() 98 | except: 99 | summary = "" 100 | 101 | try: 102 | skills = soup.find( 103 | "div", { 104 | "class": "section skills-info"}).find("ul").text.strip() 105 | except: 106 | skills = "" 107 | 108 | try: 109 | tags = soup.find("div", {"class": "section"}).find( 110 | "p").text.strip() 111 | 112 | if len(phone) > 0: 113 | if "cardi" in tags.lower(): 114 | with open("cardi.csv", "a") as f: 115 | csv_w_electro = csv.writer(f) 116 | csv_w_electro.writerow( 117 | [name, title, hospitals, phone, state, tags, summary, skills, city, address]) 118 | 119 | except: 120 | pass 121 | 122 | time.sleep(random.randint(1, 3)) 123 | 124 | time.sleep(random.randint(1, 
3)) 125 | -------------------------------------------------------------------------------- /RateMyProfessors-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver # powers the browser interaction 2 | from selenium.webdriver.support.ui import Select # selects menu options 3 | from selenium.webdriver.common.keys import Keys 4 | from bs4 import BeautifulSoup # to parse HTML 5 | import csv # to write CSV 6 | import pandas as pd # to see CSV 7 | import time 8 | import os 9 | import random 10 | 11 | 12 | header = ['Prof_Name', 13 | 'Title', 14 | 'School', 15 | 'Overall_Quality', 16 | 'Overall_Take_Again', 17 | 'Overall_Difficulty', 18 | 'Overall_Hot', 19 | 'Comment_Date', 20 | 'Rating_Type', 21 | 'Course', 22 | 'Quality', 23 | 'Difficulty', 24 | 'Credit', 25 | 'Attendance', 26 | 'Textbook', 27 | 'Take_Again', 28 | 'Grade', 29 | 'Comment', 30 | 'Helpful', 31 | 'Not_Helpful', 32 | 'URL'] 33 | 34 | with open("rmp.csv", "a") as f: 35 | csv_w = csv.writer(f) 36 | csv_w.writerow(header) 37 | 38 | base_url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' 39 | 40 | driver = webdriver.PhantomJS() 41 | driver.get(base_url + str(random.randint(1, 500000))) 42 | driver.find_element_by_css_selector('a.btn.close-this').click() 43 | 44 | for i in range(500000): 45 | url = base_url + str(random.randint(1, 500000)) 46 | driver.get(url) 47 | 48 | try: 49 | soup = BeautifulSoup(driver.page_source, 'html5lib') 50 | comment_table = soup.find('table', {'class': 'tftable'}) 51 | comments = comment_table.find_all('tr')[1:] 52 | except: 53 | continue 54 | 55 | prof_name = ' '.join( 56 | soup.find( 57 | 'h1', { 58 | 'class': 'profname'}).text.strip().split()) 59 | print(prof_name) 60 | school = soup.find('a', {'class': 'school'}).text.strip() 61 | title = ' '.join( 62 | soup.find( 63 | 'div', { 64 | 'class': 'result-title'}).text.strip().split()).split(' are you')[0] 65 | 66 | overall = soup.find_all('div', {'class': 'grade'})[:3] 67 | o_quality, o_take_again, o_difficulty = [x.text.strip() for x in overall] 68 | o_hot = soup.find_all('div', {'class': 'grade'})[3].find('img')[ 69 | 'src'].split('/')[-1].split('.')[0] 70 | 71 | all_rows = [] 72 | for c in comments: 73 | try: 74 | date = c.find('div', {'class': 'date'}).text.strip() 75 | rating_type = c.find('span', {'class': 'rating-type'}).text.strip() 76 | course = c.find('span', {'class': 'name'}).text.strip() 77 | credit = c.find('span', {'class': 'credit'} 78 | ).text.strip().split(':')[1].strip() 79 | attendance = c.find( 80 | 'span', { 81 | 'class': 'attendance'}).text.strip().split(':')[1].strip() 82 | textbook = c.find( 83 | 'span', { 84 | 'class': 'textbook-used'}).text.strip().split(':')[1].strip() 85 | take_again = c.find( 86 | 'span', { 87 | 'class': 'would-take-again'}).text.strip().split(':')[1].strip() 88 | grade = c.find('span', {'class': 'grade'} 89 | ).text.strip().split(':')[1].strip() 90 | 91 | brkdown = c.find( 92 | 'div', { 93 | 'class': 'breakdown'}).find_all( 94 | 'div', { 95 | 'class': 'descriptor-container'}) 96 | quality, difficulty = [x.text.strip().split()[0] for x in brkdown] 97 | 98 | helpful = c.find('a', {'class': 'helpful'}).find( 99 | 'span', {'class': 'count'}).text.strip() 100 | not_helpful = c.find( 101 | 'a', { 102 | 'class': 'nothelpful'}).find( 103 | 'span', { 104 | 'class': 'count'}).text.strip() 105 | 106 | comment = c.find('p', {'class': 'commentsParagraph'}).text 107 | 108 | row = [prof_name, 109 | title, 110 | school, 111 | o_quality, 112 | 
o_take_again, 113 | o_difficulty, 114 | o_hot, 115 | date, 116 | rating_type, 117 | course, 118 | quality, 119 | difficulty, 120 | credit, 121 | attendance, 122 | textbook, 123 | take_again, 124 | grade, 125 | comment, 126 | helpful, 127 | not_helpful, 128 | url] 129 | 130 | all_rows.append(row) 131 | 132 | except: 133 | pass 134 | 135 | with open("rmp.csv", "a") as f: 136 | csv_w = csv.writer(f) 137 | csv_w.writerows(all_rows) 138 | 139 | time.sleep(random.randint(1, 3)) 140 | -------------------------------------------------------------------------------- /DataMartBasicSkills-req.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from urllib import parse 4 | import json 5 | import pickle 6 | import time 7 | import re 8 | import glob 9 | 10 | 11 | class BasicSkillsCollege: 12 | 13 | def __init__(self, college): 14 | 15 | self.sess = requests.session() 16 | self.url = 'http://datamart.cccco.edu/Outcomes/BasicSkills_Cohort_Tracker.aspx' 17 | self.init_req = self.sess.get(self.url) 18 | self.init_req_soup = BeautifulSoup(self.init_req.content, 'html5lib') 19 | self.init_states = {tag['name']: tag['value'] 20 | for tag in self.init_req_soup.select('input[name^=__]')} 21 | self.college = college 22 | print(self.college) 23 | 24 | def parse_params(self, r): 25 | lst = re.search(r'\[.+\]', r.text).group() 26 | terms = lst.replace( 27 | '"', 28 | '').replace( 29 | '[', 30 | '').replace( 31 | ']', 32 | '').replace( 33 | "'", 34 | "").split(',') 35 | terms = [x.strip() for x in terms] 36 | 37 | tps = [] 38 | for i in range(len(terms)): 39 | if (i + 2) % 2 == 0: 40 | tps.append((terms[i + 1], terms[i])) 41 | 42 | return tps 43 | 44 | def get_s_terms(self): 45 | data = self.init_states 46 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxSTerm' 47 | data['__CALLBACKPARAM'] = 'c0:LECC|0;;LBCRI|4;0:-2;' 48 | data['DXScript'] = '1_243,1_138,1_237,1_164,1_141,1_135,1_226,1_234,1_162,1_170,1_161,1_229,1_159,1_227,1_165,1_143,1_176,1_151,1_232,1_149,7_50,7_53,7_48,7_52,1_235,1_218,1_228,1_210,1_184,1_136' 49 | data['DXCss'] = '0_224,1_28,0_226,0_115,1_10,0_117,0_143,7_2,0_145,../css/styles.css,../css/navigation-mininav.css,../css/design01.css,../css/footer-without-dark-container.css' 50 | data['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0] 51 | data['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1] 52 | data['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1] 53 | 54 | req = self.sess.post(self.url, data=data) 55 | 56 | sterms = self.parse_params(req) 57 | spring_2006 = [x[0] for x in sterms].index('Spring 2006') 58 | sterms = sterms[:spring_2006 + 1][::-1] 59 | 60 | return (data, sterms) 61 | 62 | def get_skills(self): 63 | data, sterms = self.get_s_terms() 64 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxBSSub' 65 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = sterms[0][0] 66 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = sterms[0][1] 67 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = sterms[0][1] 68 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = sterms[0][0] 69 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = sterms[0][1] 70 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = sterms[0][1] 71 | 72 | req = self.sess.post(self.url, data=data) 73 | skills = self.parse_params(req) 74 | 75 | return (data, sterms, skills) 76 | 77 | def get_levels(self): 78 | data, sterms, skills = self.get_skills() 79 | college_params = [] 80 | for i in range(len(sterms)): 81 | params = {} 82 | for 
i2 in range(len(sterms) - i): 83 | for i3 in range(len(skills)): 84 | if "ESL" not in skills[i3][0]: 85 | params['sterm'] = sterms[i] 86 | params['eterm'] = sterms[i2 + i] 87 | params['skill'] = skills[i3] 88 | data['__CALLBACKID'] = 'ASPxRoundPanel1$ASPxComboBoxPL' 89 | data['ASPxRoundPanel1$ASPxComboBoxSTerm'] = params[ 90 | 'sterm'][0] 91 | data['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = params[ 92 | 'sterm'][1] 93 | data['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = params[ 94 | 'sterm'][1] 95 | data['ASPxRoundPanel1$ASPxComboBoxETerm'] = params[ 96 | 'eterm'][0] 97 | data['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = params[ 98 | 'eterm'][1] 99 | data['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = params[ 100 | 'eterm'][1] 101 | data['ASPxRoundPanel1$ASPxComboBoxBSSub'] = params[ 102 | 'skill'][0] 103 | data['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = params[ 104 | 'skill'][1] 105 | data['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = params[ 106 | 'skill'][1] 107 | 108 | req = self.sess.post(self.url, data=data) 109 | 110 | try: 111 | levels = self.parse_params(req) 112 | 113 | for l in levels: 114 | params['sterm'] = sterms[i] 115 | params['eterm'] = sterms[i2 + i] 116 | params['skill'] = skills[i3] 117 | params['level'] = l 118 | college_params.append(params) 119 | params = {} 120 | 121 | except: 122 | pass 123 | 124 | pickle.dump( 125 | college_params, 126 | open('./pickles/' + self.college[0] + '.pkl', 'wb')) 127 | return college_params 128 | 129 | def dl_csv(self): 130 | config = pickle.load( 131 | open( 132 | './pickles/' + 133 | self.college[0] + 134 | '.pkl', 135 | 'rb')) 136 | num_configs = len(config) 137 | 138 | params_json = json.load(open('pickles/dump.HAR')) 139 | params1 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[ 140 | 'log']['entries'][-6]['request']['postData']['params']} 141 | params2 = {parse.unquote(d['name']): parse.unquote(d['value']) for d in params_json[ 142 | 'log']['entries'][-1]['request']['postData']['params']} 143 | 144 | headers = {d['name']: d['value'] for d in params_json[ 145 | 'log']['entries'][-1]['request']['headers']} 146 | del headers['Content-Length'] 147 | del headers['Cookie'] 148 | 149 | cookies = {'Cookie': 'ASP.NET_SessionId' + '=' + 150 | self.init_req.cookies.get_dict()['ASP.NET_SessionId']} 151 | self.sess.headers.update(cookies) 152 | 153 | data = self.init_states 154 | 155 | for k in data.keys(): 156 | params1[k] = data[k] 157 | params2[k] = data[k] 158 | 159 | for i, c in enumerate(config): 160 | print(i, num_configs, c) 161 | 162 | for p in (params1, params2): 163 | p['ASPxRoundPanel1$ASPxComboBoxColl'] = self.college[0] 164 | p['ASPxRoundPanel1_ASPxComboBoxColl_VI'] = self.college[1] 165 | p['ASPxRoundPanel1$ASPxComboBoxColl$DDD$L'] = self.college[1] 166 | p['ASPxRoundPanel1$ASPxComboBoxSTerm'] = c['sterm'][0] 167 | p['ASPxRoundPanel1_ASPxComboBoxSTerm_VI'] = c['sterm'][1] 168 | p['ASPxRoundPanel1$ASPxComboBoxSTerm$DDD$L'] = c['sterm'][1] 169 | p['ASPxRoundPanel1$ASPxComboBoxETerm'] = c['eterm'][0] 170 | p['ASPxRoundPanel1_ASPxComboBoxETerm_VI'] = c['eterm'][1] 171 | p['ASPxRoundPanel1$ASPxComboBoxETerm$DDD$L'] = c['eterm'][1] 172 | p['ASPxRoundPanel1$ASPxComboBoxBSSub'] = c['skill'][0] 173 | p['ASPxRoundPanel1_ASPxComboBoxBSSub_VI'] = c['skill'][1] 174 | p['ASPxRoundPanel1$ASPxComboBoxBSSub$DDD$L'] = c['skill'][1] 175 | p['ASPxRoundPanel1$ASPxComboBoxPL'] = c['level'][0] 176 | p['ASPxRoundPanel1_ASPxComboBoxPL_VI'] = c['level'][1] 177 | p['ASPxRoundPanel1$ASPxComboBoxPL$DDD$L'] = c['level'][1] 178 | 179 | 
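# params2 mimics the grid's 'Save As' postback; listExportFormat '1' appears to select the CSV export, since the response body is written straight to a .csv file below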
params2['__EVENTTARGET'] = 'buttonSaveAs' 180 | params2['listExportFormat'] = '1' 181 | 182 | # need to start sesh 183 | r = self.sess.post(self.url, data=params1) 184 | 185 | # now get full report 186 | r = self.sess.post(self.url, data=params2) 187 | 188 | with open("data/" + self.college[0] + '-' + c['sterm'][1] + '-' + c['eterm'][1] + '-' + c['skill'][1] + '-' + c['level'][1] + '.csv', 'w') as f: 189 | f.write(r.text) 190 | 191 | pickle.dump(config[i + 1:], 192 | open('./pickles/' + self.college[0] + '.pkl', 'wb')) 193 | 194 | time.sleep(1) 195 | 196 | if __name__ == "__main__": 197 | colleges = pickle.load(open('./pickles/college_list.pkl', 'rb')) 198 | colleges = colleges[:5] 199 | 200 | for c in colleges: 201 | if not './pickles/' + c[0] + '.pkl' in glob.glob('./pickles/*.pkl'): 202 | BasicSkillsCollege((c[0], c[1])).get_levels() 203 | 204 | BasicSkillsCollege((c[0], c[1])).dl_csv() 205 | -------------------------------------------------------------------------------- /PACER-selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import random 3 | import time 4 | from bs4 import BeautifulSoup 5 | import os 6 | import csv 7 | import datetime 8 | from send_email import send_email 9 | import glob 10 | import subprocess 11 | import re 12 | from pyvirtualdisplay import Display 13 | import sys 14 | 15 | 16 | def sift_chars(fname_str): 17 | ''' 18 | ensures filename is legal, replaces all with hyphens 19 | ''' 20 | 21 | illegal_chars = "%> 30: 98 | break 99 | 100 | time.sleep(1) # case has been found, proceed 101 | 102 | driver.find_element_by_name("date_from").clear() 103 | driver.find_element_by_name("date_from").send_keys("01/01/1990") 104 | driver.find_element_by_name("date_to").clear() 105 | driver.find_element_by_name("date_to").send_keys( 106 | datetime.date.today().strftime("%m/%d/%Y")) 107 | 108 | time.sleep(1) 109 | driver.find_element_by_name('button1').click() 110 | 111 | # get source to get docket info 112 | docket_source = str(driver.page_source) 113 | soup = BeautifulSoup(docket_source, 'html5lib') 114 | 115 | # set start for row, will change if scrape was interrupted 116 | row_start = 0 117 | 118 | # get associated cases if main case 119 | if case_num: 120 | 121 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f: 122 | reader = csv.reader(f) 123 | data = list(reader) 124 | 125 | if len(data) == 1: 126 | get_associated_cases(soup) 127 | # save docket source if main case 128 | with open(district + "/" + je_id + "/" + str(je_id) + ".html", "w", encoding="utf-8") as f: 129 | f.write(docket_source) 130 | 131 | else: 132 | row_start = len(data) - 1 133 | 134 | else: 135 | 136 | if os.path.exists( 137 | district + 138 | "/" + 139 | je_id + 140 | "/associated/" + 141 | str(case_num) + 142 | "/" + 143 | 'assoc_data.csv'): 144 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 145 | reader = csv.reader(f) 146 | data = list(reader) 147 | 148 | row_start = len(data) - 1 149 | 150 | docket_rows = [] 151 | for i in range(len(soup.findAll("table")) - 5): 152 | # table is broken up to sets of 100 rows, don't want first 4 or last 153 | ind = i + 4 154 | docket_table = soup.findAll("table")[ind] 155 | docket_headers = ("Filing Date", "#", "Docket Text") 156 | 157 | # get table info in dict 158 | for row in docket_table.findAll("tr"): 159 | row_data = [] 160 | for i, column in enumerate(row.findAll("td")): 
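# keep cell 0 (filing date) as plain text, skip cell 1, and store cell 2 as a (text, {link text: href}) pair so document links survive; any later cells keep their text only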
161 | if i == 0: 162 | row_data.append(column.text) 163 | elif i == 2: 164 | cell_urls = {} 165 | urls = column.findAll("a") 166 | for u in urls: 167 | cell_urls[u.text.strip()] = u.get("href") 168 | 169 | row_data.append((column.text.strip(), cell_urls)) 170 | 171 | elif i > 2: 172 | row_data.append(column.text.strip()) 173 | 174 | if len(row_data) > 0: 175 | docket_rows.append(tuple(row_data)) 176 | 177 | return docket_rows[row_start:] 178 | 179 | 180 | def process_link( 181 | link_str, 182 | base_url, 183 | district, 184 | already_scraped, 185 | adversary=False, 186 | dock_num=False): 187 | ''' 188 | takes any links to documents, and downloads them into file structure 189 | ''' 190 | 191 | if link_str.startswith("https://"): 192 | pass 193 | else: 194 | link_str = base_url + link_str 195 | 196 | driver.get(link_str) 197 | f_paths = [] 198 | 199 | if "Multiple Documents" in str(driver.page_source): 200 | soup = BeautifulSoup(str(driver.page_source), 'html5lib') 201 | doc_table = soup.findAll("tr") 202 | for r in doc_table: 203 | if "href" in str(r): 204 | tds = r.findAll("td") 205 | doc_url = tds[0].a["href"] 206 | dl_id = doc_url.split("/")[-1] 207 | if dl_id not in already_scraped: 208 | os.system('rm *.pdf') 209 | if doc_url.startswith("https://"): 210 | driver.get(doc_url) 211 | driver.find_element_by_xpath( 212 | '//*[@id="cmecfMainContent"]/form/input').click() 213 | else: 214 | doc_url = base_url + doc_url 215 | driver.get(doc_url) 216 | driver.find_element_by_xpath( 217 | '//*[@id="cmecfMainContent"]/form/input').click() 218 | 219 | file_name = tds[2].text 220 | new_name = sift_chars(file_name.strip()) + ".pdf" 221 | 222 | # if not associated case 223 | # create file structure 224 | if not adversary: 225 | if not os.path.exists( 226 | district + "/" + je_id + "/" + docket_number): 227 | os.makedirs( 228 | district + "/" + je_id + "/" + docket_number) 229 | 230 | new_path = district + "/" + je_id + "/" + docket_number + "/" + new_name 231 | 232 | else: 233 | if not os.path.exists( 234 | district + 235 | "/" + 236 | je_id + 237 | "/associated/" + 238 | adversary + 239 | "/" + 240 | dock_num): 241 | os.makedirs( 242 | district + 243 | "/" + 244 | je_id + 245 | "/associated/" + 246 | adversary + 247 | "/" + 248 | dock_num) 249 | 250 | new_path = district + "/" + je_id + "/associated/" + \ 251 | adversary + "/" + dock_num + "/" + new_name 252 | 253 | # wait for file to download 254 | counter = 0 255 | while len(glob.glob("*.pdf")) == 0: 256 | time.sleep(1) 257 | counter += 1 258 | if counter > 500: 259 | break 260 | 261 | time.sleep(4) 262 | download_name = glob.glob("*.pdf")[0] 263 | os.rename( 264 | download_name, re.sub( 265 | r'[^\x00-\x7f]', '-', new_path)) 266 | 267 | already_scraped.append(dl_id) 268 | f_paths.append(new_path) 269 | 270 | time.sleep(1) 271 | os.system('rm *.pdf') 272 | time.sleep(1) 273 | 274 | else: 275 | soup = BeautifulSoup(str(driver.page_source), 'html5lib') 276 | 277 | restricted = False 278 | 279 | try: 280 | dl_id = soup.find("form")["action"].split("/")[-1] 281 | 282 | except: 283 | if "The document is restricted" in driver.page_source: 284 | restricted = True 285 | elif "document is not available" in driver.page_source: 286 | restricted = True 287 | 288 | if not restricted: 289 | os.system('rm *.pdf') 290 | driver.find_element_by_xpath( 291 | '//*[@id="cmecfMainContent"]/form/input').click() 292 | 293 | if dl_id not in already_scraped: 294 | 295 | # create file structure 296 | if not adversary: 297 | if not os.path.exists( 298 | district + "/" + je_id 
+ "/" + docket_number): 299 | os.makedirs( 300 | district + "/" + je_id + "/" + docket_number) 301 | 302 | new_path = district + "/" + je_id + "/" + docket_number + "/Main Document.pdf" 303 | 304 | else: 305 | if not os.path.exists( 306 | district + 307 | "/" + 308 | je_id + 309 | "/associated/" + 310 | adversary + 311 | "/" + 312 | dock_num): 313 | os.makedirs( 314 | district + 315 | "/" + 316 | je_id + 317 | "/associated/" + 318 | adversary + 319 | "/" + 320 | dock_num) 321 | 322 | new_path = district + "/" + je_id + "/associated/" + \ 323 | adversary + "/" + dock_num + "/Main Document.pdf" 324 | 325 | # wait for file to download 326 | counter = 0 327 | while len(glob.glob("*.pdf")) == 0: 328 | time.sleep(1) 329 | counter += 1 330 | if counter > 500: 331 | break 332 | 333 | time.sleep(4) 334 | download_name = glob.glob("*.pdf")[0] 335 | os.rename( 336 | download_name, re.sub( 337 | r'[^\x00-\x7f]', '-', new_path)) 338 | 339 | already_scraped.append(dl_id) 340 | f_paths.append(new_path) 341 | 342 | time.sleep(1) 343 | os.system('rm *.pdf') 344 | time.sleep(1) 345 | 346 | else: 347 | f_paths.append("RESTRICTED") 348 | time.sleep(5) 349 | 350 | return (f_paths, already_scraped) 351 | 352 | 353 | def get_associated_cases(soup): 354 | 355 | ass_exist = True 356 | 357 | try: 358 | ass_cases_ext = soup.findAll("div", {"class": "noprint"})[ 359 | 1].find("a")["href"] 360 | 361 | except: 362 | ass_exist = False 363 | 364 | if ass_exist: 365 | driver.get(base_url + ass_cases_ext) 366 | driver.find_element_by_xpath('//*[@id="referrer_form"]/p/a').click() 367 | soup = BeautifulSoup(str(driver.page_source), "html5lib") 368 | 369 | assoc_rows = soup.find("table").findAll("tr") 370 | 371 | if not os.path.exists(district + "/" + je_id + "/" + "associated"): 372 | os.makedirs(district + "/" + je_id + "/" + "associated") 373 | 374 | with open(district + "/" + je_id + "/" + str(je_id) + "_associated_cases.html", "w", encoding="utf-8") as f: 375 | f.write(str(driver.page_source)) 376 | 377 | # if interrupted start from where last row 378 | if os.path.exists( 379 | str(district) + 380 | "/" + 381 | str(je_id) + 382 | "/" + 383 | str(je_id) + 384 | '_associated_cases.csv'): 385 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'r', encoding="utf-8") as f: 386 | reader = csv.reader(f) 387 | data = list(reader) 388 | 389 | if len(data) - 1 == len(assoc_rows): 390 | assoc_rows = [assoc_rows[-1]] 391 | else: 392 | assoc_rows = assoc_rows[len(data) - 2:] 393 | 394 | else: 395 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f: 396 | w = csv.writer(f, delimiter=',') 397 | header = ( 398 | "je_id", 399 | "Related Case No", 400 | "Caption", 401 | "Type", 402 | "Judge", 403 | "Plaintiff", 404 | "Defendant", 405 | "Plaintiff Lawyer", 406 | "Defendant Lawyer", 407 | "Date Filed", 408 | "Date Terminated", 409 | "Nature of Suit") 410 | w.writerow(header) 411 | 412 | for row in assoc_rows: # CHANGE FOR FULL 413 | columns = row.findAll("td") 414 | if len(columns) > 0: 415 | 416 | case_ext = columns[1].find("a")["href"] 417 | case_num = columns[1].find("a").text 418 | caption = ' '.join(columns[1].text.split()[1:]) 419 | case_type = columns[2].text 420 | 421 | row_to_write = (je_id, case_num, caption, case_type) 422 | 423 | with open(str(district) + "/" + str(je_id) + "/" + str(je_id) + '_associated_cases.csv', 'a', encoding="utf-8") as f: 424 | w = csv.writer(f, delimiter=',') 425 | w.writerow(row_to_write) 426 | 427 
| driver.get(base_url + case_ext) 428 | 429 | docket_rows = get_docket_rows( 430 | driver=driver, 431 | case_num=False, 432 | year=False, 433 | court_perl=False) 434 | 435 | if not os.path.exists( 436 | district + "/" + je_id + "/associated/" + case_num): 437 | os.makedirs( 438 | district + "/" + je_id + "/associated/" + case_num) 439 | 440 | with open(district + "/" + je_id + "/associated/" + case_num + "/" + str(case_num) + ".html", "w", encoding="utf-8") as f: 441 | f.write(str(driver.page_source)) 442 | 443 | if os.path.exists( 444 | district + 445 | "/" + 446 | je_id + 447 | "/associated/" + 448 | str(case_num) + 449 | "/" + 450 | 'assoc_data.csv'): 451 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 452 | reader = csv.reader(f) 453 | data = list(reader) 454 | 455 | docket_rows = docket_rows[len(data) - 1:] 456 | 457 | else: 458 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f: 459 | w = csv.writer(f, delimiter=',') 460 | header = ( 461 | "je_id", 462 | "case_num", 463 | "docket_text", 464 | "docket_number", 465 | "docket_date", 466 | "file_link", 467 | "[lawfirm1]", 468 | "[lawyers1]", 469 | "[lawfirm2]", 470 | "[lawyers2]", 471 | "[lawfirm3]", 472 | "[lawyers3]", 473 | "[moving party]", 474 | "[motion caption]") 475 | w.writerow(header) 476 | 477 | for row in docket_rows: # just 20 rows CHANGE FOR FULL 478 | docket_date = row[0] 479 | docket_text = row[2].strip() 480 | if len( 481 | row[1]) > 1 and len( 482 | row[1][0]) > 0 and row[1][0][0].isdigit(): 483 | docket_number = row[1][0].split()[0] 484 | 485 | else: 486 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'r', encoding="utf-8") as f: 487 | reader = csv.reader(f) 488 | temp_data = list(reader) 489 | docket_number = temp_data[-1][-3] 490 | 491 | already_scraped = [] 492 | paths = [] 493 | for c in row: 494 | if len(c) > 1 and isinstance( 495 | c[1], dict) and len( 496 | c[1]) > 0: 497 | for k in c[1].keys(): 498 | url = c[1][k] 499 | res = process_link( 500 | link_str=url, 501 | base_url=base_url, 502 | district=district, 503 | already_scraped=already_scraped, 504 | dock_num=docket_number, 505 | adversary=case_num) 506 | file_paths = res[0] 507 | if len(file_paths) > 0: 508 | already_scraped = res[1] 509 | paths.extend(file_paths) 510 | 511 | # wait after each link call 512 | time.sleep(random.randint(1, 3)) 513 | 514 | csv_row = [ 515 | je_id, 516 | case_num, 517 | docket_text, 518 | docket_number, 519 | docket_date, 520 | "; ".join(paths)] 521 | scraped_data[district].append(csv_row) 522 | 523 | with open(district + "/" + je_id + "/associated/" + str(case_num) + "/" + 'assoc_data.csv', 'a', encoding="utf-8") as f: 524 | w = csv.writer(f, delimiter=',') 525 | w.writerow(csv_row) 526 | 527 | time.sleep(random.randint(1, 3)) 528 | 529 | 530 | # In[ ]: 531 | 532 | # main program 533 | # for case num info 534 | with open('dataset.csv', 'r', encoding="utf-8") as f: 535 | reader = csv.reader(f) 536 | data = list(reader) 537 | 538 | with open('distlogin.csv', 'r', encoding="utf-8") as f: 539 | reader = csv.reader(f) 540 | distlogin_csv = list(reader) 541 | 542 | with open('completed', 'r') as f: 543 | completed_cases = f.read().split('\n') 544 | 545 | email_address = distlogin_csv[0][0] 546 | email_password = distlogin_csv[0][1] 547 | dl_directory = distlogin_csv[0][2] 548 | district = distlogin_csv[1][0] 549 | 550 | # change for each district 
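# keep only the dataset.csv rows whose second-to-last column matches this run's district, then strip spaces from the district name so it can be used in file paths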
551 | dist_data = [x for x in data if x[-2] == district] 552 | district = ''.join(district.split()) 553 | 554 | distlogin = {} 555 | 556 | for r in distlogin_csv[1:]: 557 | distlogin[district] = {"login": r[1], 558 | "pw": r[2], 559 | "base_url": r[3]} 560 | 561 | # prepare and loop 562 | scraped_data = {} 563 | scraped_data[district] = [] 564 | 565 | if not os.path.exists(district): 566 | os.makedirs(district) 567 | 568 | driver = login_to_pacer( 569 | login_user=distlogin[district]["login"], 570 | login_password=distlogin[district]["pw"], 571 | dl_directory=dl_directory) 572 | 573 | for case in dist_data: # just two cases CHANGE FOR FULL 574 | 575 | print(datetime.datetime.time(datetime.datetime.now())) 576 | 577 | company = case[0] 578 | je_id = case[1] 579 | case_num = case[2] 580 | petition_date = case[3] 581 | year = case[6] 582 | 583 | if je_id not in completed_cases: 584 | 585 | send_email( 586 | email_address, 587 | email_password, 588 | email_address, 589 | "New Case", 590 | "JEID" + 591 | str(je_id)) 592 | 593 | if not os.path.exists(district + "/" + je_id): 594 | os.makedirs(district + "/" + je_id) 595 | 596 | if not os.path.exists( 597 | district + 598 | "/" + 599 | je_id + 600 | "/" + 601 | je_id + 602 | "_data.csv"): 603 | # for output data 604 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'w', encoding="utf-8") as f: 605 | w = csv.writer(f, delimiter=',') 606 | header = ( 607 | "Company", 608 | "je_id", 609 | "petition_date", 610 | "casenum", 611 | "xdistfiled", 612 | "docket_text", 613 | "docket_number", 614 | "docket_date", 615 | "file_link", 616 | "[lawfirm1]", 617 | "[lawyers1]", 618 | "[lawfirm2]", 619 | "[lawyers2]", 620 | "[lawfirm3]", 621 | "[lawyers3]", 622 | "[moving party]", 623 | "[motion caption]") 624 | w.writerow(header) 625 | 626 | # change for each district 627 | base_url = distlogin[district]["base_url"] 628 | court_perl = base_url + "/cgi-bin/DktRpt.pl" 629 | docket_rows = get_docket_rows( 630 | driver=driver, 631 | case_num=case_num, 632 | year=year, 633 | court_perl=court_perl) 634 | 635 | for row in docket_rows: # just 20 rows CHANGE FOR FULL 636 | docket_date = row[0] 637 | docket_text = row[2].strip() 638 | if len( 639 | row[1]) > 1 and len( 640 | row[1][0]) > 0 and row[1][0][0].isdigit(): 641 | docket_number = row[1][0].split()[0] 642 | else: 643 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'r', encoding="utf-8") as f: 644 | reader = csv.reader(f) 645 | temp_data = list(reader) 646 | docket_number = temp_data[-1][-3] 647 | 648 | already_scraped = [] 649 | paths = [] 650 | for c in row: 651 | if len(c) > 1 and isinstance(c[1], dict) and len(c[1]) > 0: 652 | for k in c[1].keys(): 653 | url = c[1][k] 654 | res = process_link( 655 | link_str=url, 656 | base_url=base_url, 657 | district=district, 658 | already_scraped=already_scraped) 659 | file_paths = res[0] 660 | if len(file_paths) > 0: 661 | already_scraped = res[1] 662 | paths.extend(file_paths) 663 | 664 | # wait after each link call 665 | time.sleep(random.randint(1, 3)) 666 | 667 | csv_row = [ 668 | company, 669 | je_id, 670 | petition_date, 671 | case_num, 672 | district, 673 | docket_text, 674 | docket_number, 675 | docket_date, 676 | "; ".join(paths)] 677 | scraped_data[district].append(csv_row) 678 | 679 | with open(district + "/" + je_id + "/" + je_id + "_data.csv", 'a', encoding="utf-8") as f: 680 | w = csv.writer(f, delimiter=',') 681 | w.writerow(csv_row) 682 | 683 | with open('completed', 'a') as f: 684 | f.write('\n' + je_id) 685 | 686 | # zip and 
push to box 687 | p = subprocess.Popen(['bash', 688 | 'zip-push.sh', 689 | je_id], 690 | stdin=None, 691 | stdout=None, 692 | stderr=None, 693 | close_fds=True) 694 | 695 | send_email( 696 | email_address, 697 | email_password, 698 | email_address, 699 | "Finished", 700 | "Done scraping." + 701 | str(je_id)) 702 | print(datetime.datetime.time(datetime.datetime.now())) 703 | --------------------------------------------------------------------------------