├── data
│   └── README.md
├── README.md
└── src
    ├── trump_crawler3.py
    ├── trump_crawler.py
    ├── scrape_faculty.py
    ├── RoParlScraper.py
    └── basic_webscraping.ipynb

/data/README.md:
--------------------------------------------------------------------------------
1 | # This folder will contain all data files output by the code we run
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tutorials for web scraping and crawling
2 |
3 | This repository contains a basic tutorial for scraping and crawling websites using
4 | Python. It also includes some examples of code that has been written to scrape other
5 | websites.
6 |
7 | # Set-up
8 |
9 | You will need Python 3 (version 3.5 or later) installed. As a lab we are moving on to the new version of Python, and the tutorials will not work with Python 2. You can download it [here](https://www.python.org/downloads/). You can also download it via [Anaconda](https://www.continuum.io/downloads), which bundles many of the required packages in its distribution.
10 |
11 | You will also need [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed, and you should make a GitHub account if you haven't already.
12 |
13 | You can also consult our [wiki](https://github.com/socdyn/wiki/blob/master/vesta/get_started_with_python.md) for advice on setting this up.
14 |
15 | Once Python is installed, please install the following packages using `pip3`, e.g. `pip3 install jupyter` (if you never installed Python 2, you may be able to use plain `pip`). If you are using Anaconda, try installing each package with `conda` first and fall back to `pip3` if that doesn't work. Note that `json` ships with the Python standard library, so it does not need to be installed; just check that it imports.
16 |
17 | ```
18 | jupyter
19 | beautifulsoup4
20 | requests
21 | selenium
22 | pandas
23 | json
24 | ```
25 |
26 | Make sure Jupyter is running correctly; you can find information on running it [here](http://jupyter.readthedocs.io/en/latest/content-quickstart.html). You should be able to open the notebook interface in your browser by typing `jupyter notebook` into the command line. From there you can create a Python 3 notebook by clicking the 'New' tab on the right and selecting 'Python3' or 'IPython3'. You can then import all the packages in the notebook to check that they were installed.
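As a quick sanity check, you can run a cell like the following in a new notebook; this is a minimal sketch that only verifies the packages listed above import and report a version. If it runs without an `ImportError`, the set-up is complete.

```
# Verify that the packages installed above import correctly.
# json ships with the Python standard library, so it needs no pip install.
import json

import bs4        # installed under the name beautifulsoup4
import pandas
import requests
import selenium

# Each of these four packages exposes a __version__ attribute.
for module in (bs4, pandas, requests, selenium):
    print(module.__name__, module.__version__)
```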
27 | -------------------------------------------------------------------------------- /src/trump_crawler3.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import json 3 | import time 4 | from selenium import webdriver # This code is necessary to pull from Javascript 5 | driver = webdriver.PhantomJS() 6 | 7 | def page_iterator(BASE_URL, NUMBER_OF_PAGES_TRUMP): 8 | for i in range(1, NUMBER_OF_PAGES_TRUMP + 1): 9 | url = BASE_URL + str(i) 10 | yield url 11 | i+=1 12 | 13 | def get_page_content(url): 14 | driver.get(url) # Get the webpage 15 | # Convert it to a BS object - "soup" 16 | soup = BeautifulSoup(driver.page_source, "html.parser") 17 | return soup 18 | 19 | def find_valid_links(soup): 20 | press_release_urls = [] 21 | # FINDING AND STORING LINKS TO INDIVIDUAL PRESS RELEASES 22 | for link in soup.findAll('a', href=True): 23 | candidate_link = link['href'] 24 | # two simple criteria for determining if this is a press release url 25 | if "press-release" in candidate_link: 26 | if len(candidate_link) > MIN_TRUMP_URL_LEN: 27 | press_release_urls.append(candidate_link) 28 | return press_release_urls 29 | 30 | def process_press_releases(press_release_urls, VISITED_URLS): 31 | for pr_url in press_release_urls: 32 | if pr_url not in VISITED_URLS: 33 | time.sleep(1) # limit calls to 1 per second 34 | 35 | soup = get_page_content(pr_url) 36 | content = soup.find_all('p') #gets all objects with 'p' html tag 37 | paragraphs = [] 38 | for c in content: 39 | c_text = c.getText() 40 | paragraphs.append(c_text) 41 | # we don't need the first or last 5 elements 42 | # so we slice them out (this is through trial and error) 43 | trimmed_paragraphs = paragraphs[1:-5] 44 | press_release_text = "".join(trimmed_paragraphs) 45 | 46 | # CREATING DICTIONARY 47 | press_release_dict = { 48 | "text": press_release_text, 49 | "url": pr_url, 50 | "author": "Trump", 51 | } 52 | VISITED_URLS.add(pr_url) 53 | yield press_release_dict 54 | 55 | def write_to_json(press_release_dict): 56 | with open(OUTPUT_PATH, 'a') as f: 57 | # turns dict into valid json string on 1 line 58 | j = json.dumps(press_release_dict) + '\n' 59 | # writes j to file f 60 | f.write(j) 61 | 62 | if __name__ == '__main__': 63 | 64 | # Website url 65 | BASE_URL = "https://www.donaldjtrump.com/press-releases/P" 66 | 67 | # Constants 68 | NUMBER_OF_PAGES_TRUMP = 1000 69 | 70 | # Min length of a valid press release url 71 | MIN_TRUMP_URL_LEN = 50 72 | 73 | # Where we save the data output 74 | OUTPUT_PATH = '../data/trump_website.json' 75 | 76 | # This set will contain all visited press release urls 77 | VISITED_URLS = set() 78 | 79 | pages = page_iterator(BASE_URL, NUMBER_OF_PAGES_TRUMP) 80 | for p in pages: 81 | soup = get_page_content(p) 82 | prs = find_valid_links(soup) 83 | processed_prs = process_press_releases(prs, VISITED_URLS) 84 | for pprs in processed_prs: 85 | print(pprs['text']) 86 | write_to_json(pprs) 87 | -------------------------------------------------------------------------------- /src/trump_crawler.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from urllib2 import urlopen 3 | import json 4 | import time 5 | 6 | # This code is necessary to pull from Javascript 7 | from selenium import webdriver 8 | driver = webdriver.PhantomJS() 9 | 10 | # Website url 11 | base_url_trump = "https://www.donaldjtrump.com/press-releases/P" 12 | 13 | # Constants 14 | # Number of pages of press releases 15 | 
NUMBER_OF_PAGES_TRUMP = 90 16 | # Min length of a valid press release url 17 | MIN_TRUMP_URL_LEN = 50 18 | 19 | # Where we save the data output 20 | OUTPUT_PATH = '../data/trump_website.json' 21 | 22 | # This set will contain all visited press release urls 23 | press_release_url_set = set() 24 | 25 | # Main Body 26 | 27 | """ 28 | The intuition behind this is simple: 29 | 1) We get the press release urls from each page of press releases 30 | 2) We then go through the press release urls and get press release text 31 | 3) We append them to a newline-delimited json file 32 | """ 33 | 34 | # CRAWLING THROUGH EACH PAGE OF PRESS RELEASES 35 | for i in range(1, NUMBER_OF_PAGES_TRUMP + 1): 36 | 37 | press_release_urls = [] #empty list to store urls 38 | 39 | # READING EACH WEBPAGE 40 | url = base_url_trump + str(i) # Concatenate url with index value 41 | #page = urlopen(url).read() # Demo these two lines 42 | #soup = BeautifulSoup(page) 43 | driver.get(url) # Get the webpage 44 | # Convert it to a BS object - "soup" 45 | soup = BeautifulSoup(driver.page_source) 46 | 47 | # FINDING AND STORING LINKS TO INDIVIDUAL PRESS RELEASES 48 | for link in soup.findAll('a', href=True): 49 | candidate_link = link['href'] 50 | # two simple criteria for determining if this is a press release url 51 | if "press-release" in candidate_link: 52 | if len(candidate_link) > MIN_TRUMP_URL_LEN: 53 | press_release_urls.append(candidate_link) 54 | 55 | #PROCESSING PRESS RELEASES 56 | for pr_url in press_release_urls: 57 | if pr_url not in press_release_url_set: 58 | time.sleep(1) # limit calls to 1 per second 59 | press_release_url_set.add(pr_url) 60 | driver.get(pr_url) 61 | soup = BeautifulSoup(driver.page_source) 62 | content = soup.find_all('p') 63 | print ( 64 | "START OF NEW PRESS RELEASE WITH LENGTH {}!".format(len(content)) 65 | ) 66 | paragraphs = [] 67 | for c in content: 68 | c_text = c.getText() 69 | paragraphs.append(c_text) 70 | # we don't need the first or last 5 elements 71 | # so we slice them out 72 | trimmed_paragraphs = paragraphs[1:-5] 73 | press_release_text = "".join(trimmed_paragraphs) 74 | 75 | # CREATING DICTIONARY 76 | press_release_dict = { 77 | "text": press_release_text, 78 | "url": pr_url, 79 | "author": "Trump", 80 | } 81 | 82 | # WRITING DICTIONARY TO JSON 83 | with open(OUTPUT_PATH, 'a') as f: 84 | # turns dict into valid json string on 1 line 85 | j = json.dumps(press_release_dict) + '\n' 86 | # writes j to file f 87 | f.write(j) 88 | 89 | i+=1 # incrementing index by 1 90 | -------------------------------------------------------------------------------- /src/scrape_faculty.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import string 3 | from bs4 import BeautifulSoup as BS 4 | from time import sleep 5 | import pandas as pd 6 | 7 | def getSoup(url): 8 | """Input: A valid url. 9 | Gets the HTML associated with a url and 10 | converts it to a BeautifulSoup object. 11 | Then sleeps for a short time. 12 | Returns: A BeautifulSoup object.""" 13 | html = requests.get(url) 14 | soup = BS(html.content, "html.parser") 15 | sleep(6) 16 | return soup 17 | 18 | def getFacultyPages(base_url): 19 | """Input: The base url for the 20 | Cornell sociology faculty site. 21 | Finds all valid links to faculty profiles 22 | then visits each link and gets the soup 23 | object. 
24 | Returns: An iterator of faculty profiles 25 | soup objects""" 26 | soup = getSoup(base_url) 27 | links = soup.findAll('a', href=True) 28 | profiles = [] 29 | for l in links: 30 | if "/people/faculty/" in l['href']: 31 | profiles.append(l['href']) 32 | profiles = [x for x in profiles if x.endswith('faculty/') == False] 33 | profiles = set(profiles) 34 | for p in profiles: 35 | yield getSoup(p) 36 | 37 | def getFacultyInfo(soup): 38 | """Input: A soup object related to a faculty webpage. 39 | Finds the section of the page with the class 40 | entry-content, based on inspection of relevant pages, 41 | Returns: A soup object with the relevant content.""" 42 | info = soup.find('div', {'class': 'entry-content'}) 43 | return info 44 | 45 | def getTitleAndEducation(info): 46 | """Input: A soup object output by getFacultyInfo() 47 | that contains information on a faculty member. 48 | This function parses this object to get the title 49 | and education for a given faculty member. 50 | Reutrns: A tuple containing title and education 51 | strings.""" 52 | info_refined = info.findAll('h4') 53 | titles = info_refined[0].text 54 | title_and_education = titles.split('PhD') 55 | title = title_and_education[0] 56 | title = ''.join(x for x in title if x not in string.punctuation) 57 | title = title.rstrip() 58 | education = 'PhD'+title_and_education[1] 59 | education = education.split('Curriculum')[0].rstrip() 60 | return title, education 61 | 62 | def getFacultyName(soup): 63 | """Input: The soup object for a faculty page. 64 | Gets the faculty members name as it appears on the page. 65 | Returns: The name as a string.""" 66 | name_info = soup.findAll('h1', {'class':'entry-title'}) 67 | name = name_info[0].text 68 | return name 69 | 70 | if __name__ == '__main__': 71 | URL = "http://www.soc.cornell.edu/people/faculty/" 72 | faculty_pages= getFacultyPages(URL) 73 | faculty_info = {} 74 | for fp in faculty_pages: 75 | name = getFacultyName(fp) 76 | print("Getting information for ", name) 77 | try: 78 | info = getFacultyInfo(fp) 79 | title, education = getTitleAndEducation(info) 80 | print("Information obtained for ", name) 81 | except: 82 | print("Failed to get info from page for ", name) 83 | title, education = None, None 84 | faculty_info[name] = {'title':title, 'education':education} 85 | print(faculty_info) 86 | df = pd.DataFrame.from_dict(faculty_info, orient='index') 87 | df.to_csv('../data/facultyinfo2.csv',encoding='utf-8') 88 | -------------------------------------------------------------------------------- /src/RoParlScraper.py: -------------------------------------------------------------------------------- 1 | #Started April 18, 2016, last edit May 9, 2016 2 | #Radu Parvulescu 3 | 4 | """ Scrapes and stores html of transcripts from Romanian Chamber of Deputies. """ 5 | 6 | import time 7 | import random 8 | import requests 9 | import codecs 10 | import json 11 | import os 12 | import re 13 | import natsort 14 | from bs4 import BeautifulSoup 15 | import glob 16 | 17 | def Scarpe(): 18 | """ Scrapes the html of transcripts of Lower House sessions (LH only or joint meetings with Senate). 
""" 19 | 20 | for x in range(1,10): 21 | url = 'http://www.cdep.ro/pls/steno/steno2015.stenograma?ids=' + str(x) + '&idl=1' 22 | 23 | #for transparent research, tell sysadmin who I am 24 | headers = {'user-agent' : 'Mozilla 45.0 (Linux Mint 17); Radu Parvulescu/Cornell University/rap348@cornell.edu'} 25 | 26 | #stagger requests so we don't overload target site 27 | time.sleep(random.uniform(0.1,1)) 28 | 29 | #get html text and transmit header 30 | html = requests.get(url, headers=headers).text 31 | 32 | #get date-time you retrieved document 33 | retrieved = time.time() 34 | 35 | #dump into dictionary 36 | data = {'number':x, 'html':html, 'retrieved':retrieved, 'url':url} 37 | 38 | #dump in file, line-delimited json 39 | out_file_name = '../data/'+str(x) + '.txt' 40 | with codecs.open(out_file_name, 'w', encoding='UTF8') as outfile: 41 | outfile.write(json.dumps(data)) 42 | outfile.write('\n') 43 | 44 | #tells me where I am 45 | print x 46 | 47 | def ExtractSpeech(): 48 | 49 | """ Extracts the speech and vital information from the html. 50 | 51 | PreC: Reads line-delimited json .txt files in current folder. """ 52 | 53 | #loop over html files in directory, in natural order 54 | #for FILE in natsort.natsorted(os.listdir('../data')): 55 | for FILE in natsort.natsorted(glob.glob("../data/*.txt")): 56 | 57 | #tell me where I am 58 | print FILE 59 | 60 | if FILE.endswith('.txt'): 61 | with codecs.open(FILE, 'r', encoding='UTF8') as in_file: 62 | for line in in_file: 63 | dictio = json.loads(line) 64 | #take out HTML 65 | html = dictio['html'] 66 | 67 | #this is how website indexes beginning and end of 68 | #speeches. Extract speeches and get date of debate 69 | speeches = re.findall(r'', html, re.DOTALL) 70 | #ignore pages with no speeches, make year-month-day 71 | #title of debate 72 | if len(speeches) > 0: 73 | date = re.findall(r'(.*?)', html, re.DOTALL)[0] 74 | #joint sessions of lower & upper houses don't 75 | #give the date, for some strange reason 76 | if 'din ' in date: 77 | date = date.partition('din ')[2] 78 | date = re.sub(r'\s+', ' ', date) 79 | date = re.split(' ', date) 80 | date = date[2] + '-' + date[1] + '-' + date[0] 81 | #dump in files 82 | out_file_name = '../data/debate'+ '_' + str(date) + '.txt' 83 | 84 | else: 85 | date = 'Joint Session, raw file ' + str(FILE) 86 | #dump in files 87 | out_file_name = '../data/debate'+ '_' + str(date) 88 | 89 | 90 | with codecs.open(out_file_name, 'w', encoding='UTF8') as outfile: 91 | outfile.write(date) 92 | outfile.write('\n') 93 | 94 | #iterate over speeches 95 | for s in speeches: 96 | text = s.partition('-->\n')[2] 97 | soup = BeautifulSoup(text, 'lxml') 98 | speech = soup.get_text() 99 | outfile.write(speech) 100 | 101 | 102 | if __name__ == '__main__': 103 | Scarpe() 104 | ExtractSpeech() 105 | 106 | #instead of codecs try using io package -- 'io.open' 107 | -------------------------------------------------------------------------------- /src/basic_webscraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# This tutorial will show you how to scrape the web using Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## The task is to get information about every faculty member in sociology from their department profiles. 
We must begin with a base URL from which we can access the profiles" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "URL = \"http://www.soc.cornell.edu/people/faculty/\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## There are a couple of packages we need to import" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "import requests\n", 44 | "from bs4 import BeautifulSoup as BS" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "html = requests.get(URL)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "html.content" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "soup = BS(html.content, \"html.parser\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## We'll need to do these steps quite a lot so its useful to abstract it with a function " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "def getSoup(url):\n", 105 | " html = requests.get(url)\n", 106 | " soup = BS(html.content, \"html.parser\")\n", 107 | " return soup" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## BeautifulSoup provides some useful functions to parse the raw html" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "links = soup.findAll('a', href=True) #Finds all 'a' tags with an href object (i.e. 
all hyperlinks)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "links" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "#Let's take a look at one of these items \n", 148 | "links[20]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "type(links[20])" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "dir(links[20])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "x = links[20]" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "x.contents" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "x['href']" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## After experimenting with the object and determining what we want, we can then loop through all the objects returned by the query" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "profiles = []\n", 222 | "for l in links:\n", 223 | " if \"/people/faculty/\" in l['href']:\n", 224 | " profiles.append(l['href'])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "profiles" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "##We can remove the incorrect links by applying a conditional filter to profiles\n", 247 | "profiles = [x for x in profiles if x.endswith('faculty/') == False]" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "profiles" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "#Note that there are many duplicates in the list...\n", 270 | "print(len(profiles))\n", 271 | "print(len(set(profiles)))" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "profiles = list(set(profiles))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## Now we have a list of URLs we can retrieve the information from each by looping through the list and applying the function we created. The results can be saved in a dictionary." 
290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "from time import sleep\n", 301 | "profile_contents = {}\n", 302 | "for p in profiles:\n", 303 | " print(\"Getting information from: \", p)\n", 304 | " sleep(1) #Sleeping for a time interval so we're not querying too frequently\n", 305 | " soup = getSoup(p)\n", 306 | " name = p.split('/')[-2]\n", 307 | " profile_contents[name] = soup" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "print(profile_contents.keys())" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "#If we want to get the information for a particular professor we can look up their dictionary entry\n", 330 | "macy = profile_contents['macy']\n", 331 | "macy" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": { 338 | "collapsed": false 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "macy.find('div', {'class': 'entry-content'})" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "content = macy.find('div', {'class': 'entry-content'})\n", 354 | "content.text" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "content_refined = content.findAll('h4')" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "content_refined[0]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "titles = content_refined[0].text" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": { 394 | "collapsed": false 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "titles.split('PhD')" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "collapsed": true 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "title_and_education = titles.split('PhD')" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "title = title_and_education[0]\n", 421 | "education = title_and_education[1]\n", 422 | "education = 'PhD'+education" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "collapsed": false 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "title" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": false 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "education" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "## Let's tidy that up and make some functions we can reuse" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | 
"execution_count": null, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "def getFacultyInfo(soup):\n", 463 | " info = soup.find('div', {'class': 'entry-content'})\n", 464 | " return info" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "collapsed": true 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "def getTitleAndEducation(info):\n", 476 | " info_refined = info.findAll('h4')\n", 477 | " titles = info_refined[0].text\n", 478 | " title_and_education = titles.split('PhD')\n", 479 | " title = title_and_education[0]\n", 480 | " education = 'PhD'+title_and_education[1]\n", 481 | " return title, education" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "macy = getFacultyInfo(profile_contents['macy'])\n", 493 | "macy_te = getTitleAndEducation(macy)\n", 494 | "print(macy_te[0], macy_te[1])" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "heckathorn = getFacultyInfo(profile_contents['heckathorn'])\n", 506 | "heckathorn_te = getTitleAndEducation(heckathorn)\n", 507 | "print(heckathorn_te[0], heckathorn_te[1])" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": false 515 | }, 516 | "outputs": [], 517 | "source": [ 518 | "garip = getFacultyInfo(profile_contents['garip'])\n", 519 | "garip_te = getTitleAndEducation(garip)\n", 520 | "print(garip_te[0], garip_te[1])" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": { 527 | "collapsed": false 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "garip" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "import string\n", 543 | "\n", 544 | "def getTitleAndEducation2(info):\n", 545 | " info_refined = info.findAll('h4')\n", 546 | " titles = info_refined[0].text\n", 547 | " titles = ''.join(x for x in titles if x not in string.punctuation)\n", 548 | " title_and_education = titles.split('PhD')\n", 549 | " title = title_and_education[0].rstrip()\n", 550 | " education = 'PhD'+title_and_education[1]\n", 551 | " education = education.split('Curriculum')[0].rstrip() #removing additional info and whitespace\n", 552 | " return title, education" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": { 559 | "collapsed": false 560 | }, 561 | "outputs": [], 562 | "source": [ 563 | "getTitleAndEducation2(garip)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "## Now let's see if that works for all cases" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "for prof in profile_contents:\n", 582 | " print(\"Getting info for: \", prof)\n", 583 | " try:\n", 584 | " info = getFacultyInfo(profile_contents[prof])\n", 585 | " te = getTitleAndEducation(info)\n", 586 | " print(prof, te[0], te[1], '\\n')\n", 587 | " except:\n", 588 | " print(\"ERROR: Failed to get info from\", prof)\n", 589 | " sleep(1)" 590 | ] 591 | }, 
592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## OK, so it looks like we got everybody's details except Kim Weeden's. Why? Can you fix the function to get hers too." 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": { 603 | "collapsed": true 604 | }, 605 | "outputs": [], 606 | "source": [] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "## We should probably get some more information. Complete this function to get the correct name for each faculty member" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": { 619 | "collapsed": true 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "def getFacultyName(soup):\n", 624 | " name_info = soup.findAll('h1', {'class':'entry-title'})\n", 625 | " name = name_info[0].text\n", 626 | " return name" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "for prof in profile_contents:\n", 638 | " name = getFacultyName(profile_contents[prof])\n", 639 | " print(name)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "## Now we can put it all together to get a Python object containing info from each page" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": { 653 | "collapsed": false 654 | }, 655 | "outputs": [], 656 | "source": [ 657 | "faculty_info = {}\n", 658 | "for prof in profile_contents:\n", 659 | " print(\"Getting info for: \", prof)\n", 660 | " try:\n", 661 | " name = getFacultyName(profile_contents[prof])\n", 662 | " info = getFacultyInfo(profile_contents[prof])\n", 663 | " te = getTitleAndEducation2(info)\n", 664 | " print(te)\n", 665 | " faculty_info[name] = {'title': te[0], 'education':te[1]}\n", 666 | " except:\n", 667 | " print(\"ERROR: Failed to get info from\", prof)\n", 668 | " " 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": { 675 | "collapsed": false 676 | }, 677 | "outputs": [], 678 | "source": [ 679 | "faculty_info" 680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "metadata": {}, 685 | "source": [ 686 | "## OK, this looks more ore less correct. Can you see any problems?" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": { 693 | "collapsed": true 694 | }, 695 | "outputs": [], 696 | "source": [] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "## Once you have the information you need its often good to convert it into an easier format to read and to run any analyses on. Here we use pandas to convert it to a dataframe." 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "collapsed": true 710 | }, 711 | "outputs": [], 712 | "source": [ 713 | "import pandas as pd\n", 714 | "df = pd.DataFrame.from_dict(faculty_info, orient='index')" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": { 721 | "collapsed": false 722 | }, 723 | "outputs": [], 724 | "source": [ 725 | "df" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": { 731 | "collapsed": false 732 | }, 733 | "source": [ 734 | "## You also likely want to save the data somewhere. 
There are many different ways of doing this, for example in a database, a JSON file, or a csv. Here we use pandas to_csv function to write it to a csv" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": { 741 | "collapsed": false 742 | }, 743 | "outputs": [], 744 | "source": [ 745 | "df.to_csv('../data/facultyinfo.csv',encoding='utf-8')" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": { 752 | "collapsed": false 753 | }, 754 | "outputs": [], 755 | "source": [] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": { 761 | "collapsed": true 762 | }, 763 | "outputs": [], 764 | "source": [] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "metadata": { 770 | "collapsed": true 771 | }, 772 | "outputs": [], 773 | "source": [] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": null, 778 | "metadata": { 779 | "collapsed": true 780 | }, 781 | "outputs": [], 782 | "source": [] 783 | } 784 | ], 785 | "metadata": { 786 | "kernelspec": { 787 | "display_name": "IPython (Python 3)", 788 | "language": "python", 789 | "name": "python3" 790 | }, 791 | "language_info": { 792 | "codemirror_mode": { 793 | "name": "ipython", 794 | "version": 3 795 | }, 796 | "file_extension": ".py", 797 | "mimetype": "text/x-python", 798 | "name": "python", 799 | "nbconvert_exporter": "python", 800 | "pygments_lexer": "ipython3", 801 | "version": "3.5.2" 802 | } 803 | }, 804 | "nbformat": 4, 805 | "nbformat_minor": 1 806 | } 807 | --------------------------------------------------------------------------------
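A closing note on reading the output back: both Trump crawlers append one JSON object per line (keys `text`, `url`, `author`) to `../data/trump_website.json`. The sketch below, which assumes that file has already been produced by running one of the crawlers, shows one way to load line-delimited JSON into a pandas DataFrame for analysis.

```
# Read the newline-delimited JSON written by the crawlers back into a DataFrame.
# Assumes ../data/trump_website.json already exists, one JSON object per line.
import json

import pandas as pd

records = []
with open('../data/trump_website.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:                      # skip any blank lines
            records.append(json.loads(line))

df = pd.DataFrame(records)            # columns: text, url, author
print(df.shape)
print(df.head())

# pandas can also parse this format directly:
# df = pd.read_json('../data/trump_website.json', lines=True)
```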