├── data
│   └── README.md
├── README.md
└── src
    ├── trump_crawler3.py
    ├── trump_crawler.py
    ├── scrape_faculty.py
    ├── RoParlScraper.py
    └── basic_webscraping.ipynb
/data/README.md:
--------------------------------------------------------------------------------
1 | # This folder will contain all data files output by the code we run
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tutorials for web scraping and crawling
2 |
3 | This repository contains a basic tutorial for scraping and crawling websites using
4 | Python. It also includes some examples of code that has been written to scrape other
5 | websites.
6 |
7 | # Set-up
8 |
9 | You will need to have Python 3.5 installed. As a lab we are moving to the new version of Python, and these tutorials will not work with Python 2. You can download Python [here](https://www.python.org/downloads/). You can also download it via [Anaconda](https://www.continuum.io/downloads), which includes many of the required packages in its distribution.
10 |
11 | You will also need to have [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed. You should make a GitHub account if you haven’t already.
12 |
13 | You can also consult our [wiki](https://github.com/socdyn/wiki/blob/master/vesta/get_started_with_python.md) for advice on setting this up.
14 |
15 | When you have it installed, please install the following packages using `pip3`, e.g. `pip3 install jupyter` (if you never installed Python 2 then you might be able to just use `pip`). If you are using Anaconda then please try to install the packages with `conda` first, and if that doesn't work use `pip3`. Note that `json` is part of the Python standard library, so it does not need to be installed.
16 |
17 | ```
18 | jupyter
19 | beautifulsoup4
20 | requests
21 | selenium
22 | pandas
24 | ```
25 |
26 | You should make sure you have Jupyter notebooks running correctly; you can find information on running them [here](http://jupyter.readthedocs.io/en/latest/content-quickstart.html). You should be able to open the notebook interface in your browser by typing `jupyter notebook` into the command line. From there you can create a Python 3 notebook by clicking the ‘New’ tab on the right and selecting ‘Python 3’ or ‘IPython 3’. You can then import all the packages into the notebook to check they were installed.
27 |
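28 | For example, a quick way to check that everything is installed is to run a cell like the following in a new Python 3 notebook (a minimal sketch based on the package list above; `jupyter` itself does not need to be imported because you are already running inside it):
29 | 
30 | ```
31 | import json      # part of the standard library, no install needed
32 | import requests
33 | import bs4       # installed as beautifulsoup4
34 | import pandas
35 | import selenium
36 | 
37 | print("All packages imported successfully")
38 | ```
39 | 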
--------------------------------------------------------------------------------
/src/trump_crawler3.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import json
3 | import time
4 | from selenium import webdriver  # Selenium is needed to render JavaScript-generated content
5 | driver = webdriver.PhantomJS()
6 |
7 | def page_iterator(BASE_URL, NUMBER_OF_PAGES_TRUMP):
8 | for i in range(1, NUMBER_OF_PAGES_TRUMP + 1):
9 | url = BASE_URL + str(i)
10 | yield url
11 |
12 |
13 | def get_page_content(url):
14 | driver.get(url) # Get the webpage
15 | # Convert it to a BS object - "soup"
16 | soup = BeautifulSoup(driver.page_source, "html.parser")
17 | return soup
18 |
19 | def find_valid_links(soup):
20 | press_release_urls = []
21 | # FINDING AND STORING LINKS TO INDIVIDUAL PRESS RELEASES
22 | for link in soup.findAll('a', href=True):
23 | candidate_link = link['href']
24 | # two simple criteria for determining if this is a press release url
25 | if "press-release" in candidate_link:
26 | if len(candidate_link) > MIN_TRUMP_URL_LEN:
27 | press_release_urls.append(candidate_link)
28 | return press_release_urls
29 |
30 | def process_press_releases(press_release_urls, VISITED_URLS):
31 | for pr_url in press_release_urls:
32 | if pr_url not in VISITED_URLS:
33 | time.sleep(1) # limit calls to 1 per second
34 |
35 | soup = get_page_content(pr_url)
36 | content = soup.find_all('p') #gets all objects with 'p' html tag
37 | paragraphs = []
38 | for c in content:
39 | c_text = c.getText()
40 | paragraphs.append(c_text)
41 |             # we don't need the first element or the last 5
42 | # so we slice them out (this is through trial and error)
43 | trimmed_paragraphs = paragraphs[1:-5]
44 | press_release_text = "".join(trimmed_paragraphs)
45 |
46 | # CREATING DICTIONARY
47 | press_release_dict = {
48 | "text": press_release_text,
49 | "url": pr_url,
50 | "author": "Trump",
51 | }
52 | VISITED_URLS.add(pr_url)
53 | yield press_release_dict
54 |
55 | def write_to_json(press_release_dict):
56 | with open(OUTPUT_PATH, 'a') as f:
57 | # turns dict into valid json string on 1 line
58 | j = json.dumps(press_release_dict) + '\n'
59 | # writes j to file f
60 | f.write(j)
61 |
62 | if __name__ == '__main__':
63 |
64 | # Website url
65 | BASE_URL = "https://www.donaldjtrump.com/press-releases/P"
66 |
67 | # Constants
68 | NUMBER_OF_PAGES_TRUMP = 1000
69 |
70 | # Min length of a valid press release url
71 | MIN_TRUMP_URL_LEN = 50
72 |
73 | # Where we save the data output
74 | OUTPUT_PATH = '../data/trump_website.json'
75 |
76 | # This set will contain all visited press release urls
77 | VISITED_URLS = set()
78 |
79 | pages = page_iterator(BASE_URL, NUMBER_OF_PAGES_TRUMP)
80 | for p in pages:
81 | soup = get_page_content(p)
82 | prs = find_valid_links(soup)
83 | processed_prs = process_press_releases(prs, VISITED_URLS)
84 | for pprs in processed_prs:
85 | print(pprs['text'])
86 | write_to_json(pprs)
87 |
--------------------------------------------------------------------------------
/src/trump_crawler.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from urllib.request import urlopen
3 | import json
4 | import time
5 |
6 | # Selenium is needed to render JavaScript-generated content
7 | from selenium import webdriver
8 | driver = webdriver.PhantomJS()
9 |
10 | # Website url
11 | base_url_trump = "https://www.donaldjtrump.com/press-releases/P"
12 |
13 | # Constants
14 | # Number of pages of press releases
15 | NUMBER_OF_PAGES_TRUMP = 90
16 | # Min length of a valid press release url
17 | MIN_TRUMP_URL_LEN = 50
18 |
19 | # Where we save the data output
20 | OUTPUT_PATH = '../data/trump_website.json'
21 |
22 | # This set will contain all visited press release urls
23 | press_release_url_set = set()
24 |
25 | # Main Body
26 |
27 | """
28 | The intuition behind this is simple:
29 | 1) We get the press release urls from each page of press releases
30 | 2) We then go through the press release urls and get press release text
31 | 3) We append them to a newline-delimited json file
32 | """
33 |
34 | # CRAWLING THROUGH EACH PAGE OF PRESS RELEASES
35 | for i in range(1, NUMBER_OF_PAGES_TRUMP + 1):
36 |
37 | press_release_urls = [] #empty list to store urls
38 |
39 | # READING EACH WEBPAGE
40 | url = base_url_trump + str(i) # Concatenate url with index value
41 | #page = urlopen(url).read() # Demo these two lines
42 | #soup = BeautifulSoup(page)
43 | driver.get(url) # Get the webpage
44 | # Convert it to a BS object - "soup"
45 |     soup = BeautifulSoup(driver.page_source, "html.parser")
46 |
47 | # FINDING AND STORING LINKS TO INDIVIDUAL PRESS RELEASES
48 | for link in soup.findAll('a', href=True):
49 | candidate_link = link['href']
50 | # two simple criteria for determining if this is a press release url
51 | if "press-release" in candidate_link:
52 | if len(candidate_link) > MIN_TRUMP_URL_LEN:
53 | press_release_urls.append(candidate_link)
54 |
55 | #PROCESSING PRESS RELEASES
56 | for pr_url in press_release_urls:
57 | if pr_url not in press_release_url_set:
58 | time.sleep(1) # limit calls to 1 per second
59 | press_release_url_set.add(pr_url)
60 | driver.get(pr_url)
61 |             soup = BeautifulSoup(driver.page_source, "html.parser")
62 | content = soup.find_all('p')
63 | print (
64 | "START OF NEW PRESS RELEASE WITH LENGTH {}!".format(len(content))
65 | )
66 | paragraphs = []
67 | for c in content:
68 | c_text = c.getText()
69 | paragraphs.append(c_text)
70 |             # we don't need the first element or the last 5
71 | # so we slice them out
72 | trimmed_paragraphs = paragraphs[1:-5]
73 | press_release_text = "".join(trimmed_paragraphs)
74 |
75 | # CREATING DICTIONARY
76 | press_release_dict = {
77 | "text": press_release_text,
78 | "url": pr_url,
79 | "author": "Trump",
80 | }
81 |
82 | # WRITING DICTIONARY TO JSON
83 | with open(OUTPUT_PATH, 'a') as f:
84 | # turns dict into valid json string on 1 line
85 | j = json.dumps(press_release_dict) + '\n'
86 | # writes j to file f
87 | f.write(j)
88 |
89 |
90 |
--------------------------------------------------------------------------------
/src/scrape_faculty.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import string
3 | from bs4 import BeautifulSoup as BS
4 | from time import sleep
5 | import pandas as pd
6 |
7 | def getSoup(url):
8 | """Input: A valid url.
9 | Gets the HTML associated with a url and
10 | converts it to a BeautifulSoup object.
11 | Then sleeps for a short time.
12 | Returns: A BeautifulSoup object."""
13 | html = requests.get(url)
14 | soup = BS(html.content, "html.parser")
15 | sleep(6)
16 | return soup
17 |
18 | def getFacultyPages(base_url):
19 | """Input: The base url for the
20 | Cornell sociology faculty site.
21 | Finds all valid links to faculty profiles
22 | then visits each link and gets the soup
23 | object.
24 | Returns: An iterator of faculty profiles
25 | soup objects"""
26 | soup = getSoup(base_url)
27 | links = soup.findAll('a', href=True)
28 | profiles = []
29 | for l in links:
30 | if "/people/faculty/" in l['href']:
31 | profiles.append(l['href'])
32 | profiles = [x for x in profiles if x.endswith('faculty/') == False]
33 | profiles = set(profiles)
34 | for p in profiles:
35 | yield getSoup(p)
36 |
37 | def getFacultyInfo(soup):
38 | """Input: A soup object related to a faculty webpage.
39 | Finds the section of the page with the class
40 | entry-content, based on inspection of relevant pages,
41 |     entry-content, based on inspection of relevant pages.
42 | info = soup.find('div', {'class': 'entry-content'})
43 | return info
44 |
45 | def getTitleAndEducation(info):
46 | """Input: A soup object output by getFacultyInfo()
47 | that contains information on a faculty member.
48 | This function parses this object to get the title
49 | and education for a given faculty member.
50 |     Returns: A tuple containing title and education
51 | strings."""
52 | info_refined = info.findAll('h4')
53 | titles = info_refined[0].text
54 | title_and_education = titles.split('PhD')
55 | title = title_and_education[0]
56 | title = ''.join(x for x in title if x not in string.punctuation)
57 | title = title.rstrip()
58 | education = 'PhD'+title_and_education[1]
59 | education = education.split('Curriculum')[0].rstrip()
60 | return title, education
61 |
62 | def getFacultyName(soup):
63 | """Input: The soup object for a faculty page.
64 |     Gets the faculty member's name as it appears on the page.
65 | Returns: The name as a string."""
66 | name_info = soup.findAll('h1', {'class':'entry-title'})
67 | name = name_info[0].text
68 | return name
69 |
70 | if __name__ == '__main__':
71 | URL = "http://www.soc.cornell.edu/people/faculty/"
72 |     faculty_pages = getFacultyPages(URL)
73 | faculty_info = {}
74 | for fp in faculty_pages:
75 | name = getFacultyName(fp)
76 | print("Getting information for ", name)
77 | try:
78 | info = getFacultyInfo(fp)
79 | title, education = getTitleAndEducation(info)
80 | print("Information obtained for ", name)
81 | except:
82 | print("Failed to get info from page for ", name)
83 | title, education = None, None
84 | faculty_info[name] = {'title':title, 'education':education}
85 | print(faculty_info)
86 | df = pd.DataFrame.from_dict(faculty_info, orient='index')
87 | df.to_csv('../data/facultyinfo2.csv',encoding='utf-8')
88 |
--------------------------------------------------------------------------------
/src/RoParlScraper.py:
--------------------------------------------------------------------------------
1 | #Started April 18, 2016, last edit May 9, 2016
2 | #Radu Parvulescu
3 |
4 | """ Scrapes and stores html of transcripts from Romanian Chamber of Deputies. """
5 |
6 | import time
7 | import random
8 | import requests
9 | import codecs
10 | import json
11 | import os
12 | import re
13 | import natsort
14 | from bs4 import BeautifulSoup
15 | import glob
16 |
17 | def Scrape():
18 | """ Scrapes the html of transcripts of Lower House sessions (LH only or joint meetings with Senate). """
19 |
20 | for x in range(1,10):
21 | url = 'http://www.cdep.ro/pls/steno/steno2015.stenograma?ids=' + str(x) + '&idl=1'
22 |
23 | #for transparent research, tell sysadmin who I am
24 | headers = {'user-agent' : 'Mozilla 45.0 (Linux Mint 17); Radu Parvulescu/Cornell University/rap348@cornell.edu'}
25 |
26 | #stagger requests so we don't overload target site
27 | time.sleep(random.uniform(0.1,1))
28 |
29 | #get html text and transmit header
30 | html = requests.get(url, headers=headers).text
31 |
32 | #get date-time you retrieved document
33 | retrieved = time.time()
34 |
35 | #dump into dictionary
36 | data = {'number':x, 'html':html, 'retrieved':retrieved, 'url':url}
37 |
38 | #dump in file, line-delimited json
39 | out_file_name = '../data/'+str(x) + '.txt'
40 | with codecs.open(out_file_name, 'w', encoding='UTF8') as outfile:
41 | outfile.write(json.dumps(data))
42 | outfile.write('\n')
43 |
44 | #tells me where I am
45 |         print(x)
46 |
47 | def ExtractSpeech():
48 |
49 | """ Extracts the speech and vital information from the html.
50 |
51 | PreC: Reads line-delimited json .txt files in current folder. """
52 |
53 | #loop over html files in directory, in natural order
54 | #for FILE in natsort.natsorted(os.listdir('../data')):
55 | for FILE in natsort.natsorted(glob.glob("../data/*.txt")):
56 |
57 | #tell me where I am
58 |         print(FILE)
59 |
60 | if FILE.endswith('.txt'):
61 | with codecs.open(FILE, 'r', encoding='UTF8') as in_file:
62 | for line in in_file:
63 | dictio = json.loads(line)
64 | #take out HTML
65 | html = dictio['html']
66 |
67 | #this is how website indexes beginning and end of
68 | #speeches. Extract speeches and get date of debate
69 | speeches = re.findall(r'', html, re.DOTALL)
70 | #ignore pages with no speeches, make year-month-day
71 | #title of debate
72 | if len(speeches) > 0:
73 |                         date = re.findall(r'(.*?)', html, re.DOTALL)[0]
74 | #joint sessions of lower & upper houses don't
75 | #give the date, for some strange reason
76 | if 'din ' in date:
77 | date = date.partition('din ')[2]
78 | date = re.sub(r'\s+', ' ', date)
79 | date = re.split(' ', date)
80 | date = date[2] + '-' + date[1] + '-' + date[0]
81 | #dump in files
82 | out_file_name = '../data/debate'+ '_' + str(date) + '.txt'
83 |
84 | else:
85 | date = 'Joint Session, raw file ' + str(FILE)
86 | #dump in files
87 | out_file_name = '../data/debate'+ '_' + str(date)
88 |
89 |
90 | with codecs.open(out_file_name, 'w', encoding='UTF8') as outfile:
91 | outfile.write(date)
92 | outfile.write('\n')
93 |
94 | #iterate over speeches
95 | for s in speeches:
96 | text = s.partition('-->\n')[2]
97 | soup = BeautifulSoup(text, 'lxml')
98 | speech = soup.get_text()
99 | outfile.write(speech)
100 |
101 |
102 | if __name__ == '__main__':
103 |     Scrape()
104 | ExtractSpeech()
105 |
106 | #instead of codecs try using io package -- 'io.open'
107 |
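108 | # e.g., a minimal sketch of the io.open suggestion above (it reuses the same
109 | # out_file_name and data variables from Scrape(); in Python 3, io.open is the
110 | # same function as the built-in open()):
111 | #
112 | #   import io
113 | #   with io.open(out_file_name, 'w', encoding='utf-8') as outfile:
114 | #       outfile.write(json.dumps(data))
115 | #       outfile.write('\n')
116 | 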
--------------------------------------------------------------------------------
/src/basic_webscraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# This tutorial will show you how to scrape the web using Python"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## The task is to get information about every faculty member in sociology from their department profiles. We must begin with a base URL from which we can access the profiles"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "URL = \"http://www.soc.cornell.edu/people/faculty/\""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## There are a couple of packages we need to import"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "import requests\n",
44 | "from bs4 import BeautifulSoup as BS"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "collapsed": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "html = requests.get(URL)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "collapsed": false
63 | },
64 | "outputs": [],
65 | "source": [
66 | "html.content"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": false
74 | },
75 | "outputs": [],
76 | "source": [
77 | "soup = BS(html.content, \"html.parser\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [],
87 | "source": []
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 |     "## We'll need to do these steps quite a lot so it's useful to abstract it with a function"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "def getSoup(url):\n",
105 | " html = requests.get(url)\n",
106 | " soup = BS(html.content, \"html.parser\")\n",
107 | " return soup"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## BeautifulSoup provides some useful functions to parse the raw html"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 |     "links = soup.findAll('a', href=True) #Finds all 'a' tags with an href attribute (i.e. all hyperlinks)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {
132 | "collapsed": false
133 | },
134 | "outputs": [],
135 | "source": [
136 | "links"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [],
146 | "source": [
147 | "#Let's take a look at one of these items \n",
148 | "links[20]"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [],
158 | "source": [
159 | "type(links[20])"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [],
169 | "source": [
170 | "dir(links[20])"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "outputs": [],
180 | "source": [
181 | "x = links[20]"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [],
191 | "source": [
192 | "x.contents"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [],
202 | "source": [
203 | "x['href']"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "## After experimenting with the object and determining what we want, we can then loop through all the objects returned by the query"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "collapsed": true
218 | },
219 | "outputs": [],
220 | "source": [
221 | "profiles = []\n",
222 | "for l in links:\n",
223 | " if \"/people/faculty/\" in l['href']:\n",
224 | " profiles.append(l['href'])"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "collapsed": false
232 | },
233 | "outputs": [],
234 | "source": [
235 | "profiles"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {
242 | "collapsed": false
243 | },
244 | "outputs": [],
245 | "source": [
246 | "##We can remove the incorrect links by applying a conditional filter to profiles\n",
247 | "profiles = [x for x in profiles if x.endswith('faculty/') == False]"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {
254 | "collapsed": false
255 | },
256 | "outputs": [],
257 | "source": [
258 | "profiles"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "collapsed": false
266 | },
267 | "outputs": [],
268 | "source": [
269 | "#Note that there are many duplicates in the list...\n",
270 | "print(len(profiles))\n",
271 | "print(len(set(profiles)))"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "collapsed": false
279 | },
280 | "outputs": [],
281 | "source": [
282 | "profiles = list(set(profiles))"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 |     "## Now that we have a list of URLs, we can retrieve the information from each by looping through the list and applying the function we created. The results can be saved in a dictionary."
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {
296 | "collapsed": false
297 | },
298 | "outputs": [],
299 | "source": [
300 | "from time import sleep\n",
301 | "profile_contents = {}\n",
302 | "for p in profiles:\n",
303 | " print(\"Getting information from: \", p)\n",
304 | " sleep(1) #Sleeping for a time interval so we're not querying too frequently\n",
305 | " soup = getSoup(p)\n",
306 | " name = p.split('/')[-2]\n",
307 | " profile_contents[name] = soup"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "collapsed": false
315 | },
316 | "outputs": [],
317 | "source": [
318 | "print(profile_contents.keys())"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {
325 | "collapsed": false
326 | },
327 | "outputs": [],
328 | "source": [
329 | "#If we want to get the information for a particular professor we can look up their dictionary entry\n",
330 | "macy = profile_contents['macy']\n",
331 | "macy"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "collapsed": false
339 | },
340 | "outputs": [],
341 | "source": [
342 | "macy.find('div', {'class': 'entry-content'})"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {
349 | "collapsed": false
350 | },
351 | "outputs": [],
352 | "source": [
353 | "content = macy.find('div', {'class': 'entry-content'})\n",
354 | "content.text"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "metadata": {
361 | "collapsed": false
362 | },
363 | "outputs": [],
364 | "source": [
365 | "content_refined = content.findAll('h4')"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [],
375 | "source": [
376 | "content_refined[0]"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {
383 | "collapsed": false
384 | },
385 | "outputs": [],
386 | "source": [
387 | "titles = content_refined[0].text"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {
394 | "collapsed": false
395 | },
396 | "outputs": [],
397 | "source": [
398 | "titles.split('PhD')"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "collapsed": true
406 | },
407 | "outputs": [],
408 | "source": [
409 | "title_and_education = titles.split('PhD')"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {
416 | "collapsed": true
417 | },
418 | "outputs": [],
419 | "source": [
420 | "title = title_and_education[0]\n",
421 | "education = title_and_education[1]\n",
422 | "education = 'PhD'+education"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {
429 | "collapsed": false
430 | },
431 | "outputs": [],
432 | "source": [
433 | "title"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "metadata": {
440 | "collapsed": false
441 | },
442 | "outputs": [],
443 | "source": [
444 | "education"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {},
450 | "source": [
451 | "## Let's tidy that up and make some functions we can reuse"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "metadata": {
458 | "collapsed": true
459 | },
460 | "outputs": [],
461 | "source": [
462 | "def getFacultyInfo(soup):\n",
463 | " info = soup.find('div', {'class': 'entry-content'})\n",
464 | " return info"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {
471 | "collapsed": true
472 | },
473 | "outputs": [],
474 | "source": [
475 | "def getTitleAndEducation(info):\n",
476 | " info_refined = info.findAll('h4')\n",
477 | " titles = info_refined[0].text\n",
478 | " title_and_education = titles.split('PhD')\n",
479 | " title = title_and_education[0]\n",
480 | " education = 'PhD'+title_and_education[1]\n",
481 | " return title, education"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "metadata": {
488 | "collapsed": false
489 | },
490 | "outputs": [],
491 | "source": [
492 | "macy = getFacultyInfo(profile_contents['macy'])\n",
493 | "macy_te = getTitleAndEducation(macy)\n",
494 | "print(macy_te[0], macy_te[1])"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": null,
500 | "metadata": {
501 | "collapsed": false
502 | },
503 | "outputs": [],
504 | "source": [
505 | "heckathorn = getFacultyInfo(profile_contents['heckathorn'])\n",
506 | "heckathorn_te = getTitleAndEducation(heckathorn)\n",
507 | "print(heckathorn_te[0], heckathorn_te[1])"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {
514 | "collapsed": false
515 | },
516 | "outputs": [],
517 | "source": [
518 | "garip = getFacultyInfo(profile_contents['garip'])\n",
519 | "garip_te = getTitleAndEducation(garip)\n",
520 | "print(garip_te[0], garip_te[1])"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "metadata": {
527 | "collapsed": false
528 | },
529 | "outputs": [],
530 | "source": [
531 | "garip"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {
538 | "collapsed": false
539 | },
540 | "outputs": [],
541 | "source": [
542 | "import string\n",
543 | "\n",
544 | "def getTitleAndEducation2(info):\n",
545 | " info_refined = info.findAll('h4')\n",
546 | " titles = info_refined[0].text\n",
547 | " titles = ''.join(x for x in titles if x not in string.punctuation)\n",
548 | " title_and_education = titles.split('PhD')\n",
549 | " title = title_and_education[0].rstrip()\n",
550 | " education = 'PhD'+title_and_education[1]\n",
551 | " education = education.split('Curriculum')[0].rstrip() #removing additional info and whitespace\n",
552 | " return title, education"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {
559 | "collapsed": false
560 | },
561 | "outputs": [],
562 | "source": [
563 | "getTitleAndEducation2(garip)"
564 | ]
565 | },
566 | {
567 | "cell_type": "markdown",
568 | "metadata": {},
569 | "source": [
570 | "## Now let's see if that works for all cases"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {
577 | "collapsed": false
578 | },
579 | "outputs": [],
580 | "source": [
581 | "for prof in profile_contents:\n",
582 | " print(\"Getting info for: \", prof)\n",
583 | " try:\n",
584 | " info = getFacultyInfo(profile_contents[prof])\n",
585 | " te = getTitleAndEducation(info)\n",
586 | " print(prof, te[0], te[1], '\\n')\n",
587 | " except:\n",
588 | " print(\"ERROR: Failed to get info from\", prof)\n",
589 | " sleep(1)"
590 | ]
591 | },
592 | {
593 | "cell_type": "markdown",
594 | "metadata": {},
595 | "source": [
596 |     "## OK, so it looks like we got everybody's details except Kim Weeden's. Why? Can you fix the function to get hers too?"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {
603 | "collapsed": true
604 | },
605 | "outputs": [],
606 | "source": []
607 | },
608 | {
609 | "cell_type": "markdown",
610 | "metadata": {},
611 | "source": [
612 | "## We should probably get some more information. Complete this function to get the correct name for each faculty member"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {
619 | "collapsed": true
620 | },
621 | "outputs": [],
622 | "source": [
623 | "def getFacultyName(soup):\n",
624 | " name_info = soup.findAll('h1', {'class':'entry-title'})\n",
625 | " name = name_info[0].text\n",
626 | " return name"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "metadata": {
633 | "collapsed": false
634 | },
635 | "outputs": [],
636 | "source": [
637 | "for prof in profile_contents:\n",
638 | " name = getFacultyName(profile_contents[prof])\n",
639 | " print(name)"
640 | ]
641 | },
642 | {
643 | "cell_type": "markdown",
644 | "metadata": {},
645 | "source": [
646 | "## Now we can put it all together to get a Python object containing info from each page"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {
653 | "collapsed": false
654 | },
655 | "outputs": [],
656 | "source": [
657 | "faculty_info = {}\n",
658 | "for prof in profile_contents:\n",
659 | " print(\"Getting info for: \", prof)\n",
660 | " try:\n",
661 | " name = getFacultyName(profile_contents[prof])\n",
662 | " info = getFacultyInfo(profile_contents[prof])\n",
663 | " te = getTitleAndEducation2(info)\n",
664 | " print(te)\n",
665 | " faculty_info[name] = {'title': te[0], 'education':te[1]}\n",
666 | " except:\n",
667 | " print(\"ERROR: Failed to get info from\", prof)\n",
668 | " "
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": null,
674 | "metadata": {
675 | "collapsed": false
676 | },
677 | "outputs": [],
678 | "source": [
679 | "faculty_info"
680 | ]
681 | },
682 | {
683 | "cell_type": "markdown",
684 | "metadata": {},
685 | "source": [
686 |     "## OK, this looks more or less correct. Can you see any problems?"
687 | ]
688 | },
689 | {
690 | "cell_type": "code",
691 | "execution_count": null,
692 | "metadata": {
693 | "collapsed": true
694 | },
695 | "outputs": [],
696 | "source": []
697 | },
698 | {
699 | "cell_type": "markdown",
700 | "metadata": {},
701 | "source": [
702 |     "## Once you have the information you need, it's often good to convert it into an easier format to read and to run any analyses on. Here we use pandas to convert it to a dataframe."
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "metadata": {
709 | "collapsed": true
710 | },
711 | "outputs": [],
712 | "source": [
713 | "import pandas as pd\n",
714 | "df = pd.DataFrame.from_dict(faculty_info, orient='index')"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": null,
720 | "metadata": {
721 | "collapsed": false
722 | },
723 | "outputs": [],
724 | "source": [
725 | "df"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {
731 | "collapsed": false
732 | },
733 | "source": [
734 |     "## You also likely want to save the data somewhere. There are many different ways of doing this, for example in a database, a JSON file, or a csv. Here we use the pandas `to_csv` function to write it to a csv file."
735 | ]
736 | },
737 | {
738 | "cell_type": "code",
739 | "execution_count": null,
740 | "metadata": {
741 | "collapsed": false
742 | },
743 | "outputs": [],
744 | "source": [
745 | "df.to_csv('../data/facultyinfo.csv',encoding='utf-8')"
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": null,
751 | "metadata": {
752 | "collapsed": false
753 | },
754 | "outputs": [],
755 | "source": []
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": null,
760 | "metadata": {
761 | "collapsed": true
762 | },
763 | "outputs": [],
764 | "source": []
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": null,
769 | "metadata": {
770 | "collapsed": true
771 | },
772 | "outputs": [],
773 | "source": []
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": null,
778 | "metadata": {
779 | "collapsed": true
780 | },
781 | "outputs": [],
782 | "source": []
783 | }
784 | ],
785 | "metadata": {
786 | "kernelspec": {
787 | "display_name": "IPython (Python 3)",
788 | "language": "python",
789 | "name": "python3"
790 | },
791 | "language_info": {
792 | "codemirror_mode": {
793 | "name": "ipython",
794 | "version": 3
795 | },
796 | "file_extension": ".py",
797 | "mimetype": "text/x-python",
798 | "name": "python",
799 | "nbconvert_exporter": "python",
800 | "pygments_lexer": "ipython3",
801 | "version": "3.5.2"
802 | }
803 | },
804 | "nbformat": 4,
805 | "nbformat_minor": 1
806 | }
807 |
--------------------------------------------------------------------------------