├── docs ├── requirements.txt ├── pt4.md ├── pt9.md ├── pt3.md ├── pt8.md ├── pt10.md ├── pt5.md ├── next_steps.md ├── pt7.md ├── pt6.md ├── pt2.md ├── pt1.md └── index.md ├── site ├── requirements.txt ├── img │ └── favicon.ico ├── fonts │ ├── fontawesome-webfont.eot │ ├── fontawesome-webfont.ttf │ └── fontawesome-webfont.woff ├── mkdocs │ └── js │ │ ├── search-results-template.mustache │ │ ├── search.js │ │ └── mustache.min.js ├── license │ └── highlight.js │ │ └── LICENSE ├── js │ └── theme.js ├── css │ ├── highlight.css │ └── theme_extra.css ├── sitemap.xml ├── search.html ├── pt4 │ └── index.html ├── pt9 │ └── index.html └── pt3 │ └── index.html ├── 6_from_apis ├── crime.db ├── crime_backup.json.zip ├── fun_with_datetime.py ├── fun_with_sqlite.py ├── completed │ ├── fun_with_datetime_done.py │ ├── fun_with_sqlite_done.py │ └── API_done.py └── API.py ├── 8_cleaning ├── Candidates.xlsx ├── Candidates_backup.xlsx ├── fun_with_excel.py ├── clean_csv.py ├── completed │ ├── fun_with_excel_done.py │ ├── clean_csv_done.py │ ├── excel_done.py │ └── names_done.py ├── excel.py └── names.py ├── 10_encoding_debugging ├── some_text.txt ├── broken_code.png ├── encoding.py └── completed │ └── encoding_done.py ├── 4_make_function ├── call_function.py ├── completed │ ├── call_function_done.py │ └── payday_parser_done.py └── payday_parser.py ├── requirements.txt ├── requirements-win.txt ├── .gitattributes ├── 2_web_scrape ├── table_example.html ├── fun_with_csv.py ├── fun_with_bs.py ├── completed │ ├── fun_with_csv_done.py │ ├── fun_with_bs_done.py │ ├── scrape_done.py │ ├── scrape_pt2_done.py │ └── fun_with_regex_done.py ├── scrape.py ├── scrape_pt2.py └── fun_with_regex.py ├── .gitignore ├── README.md ├── mkdocs.yml ├── 1_intro ├── fun_with_subs.py ├── var.py ├── completed │ ├── fun_with_subs_done.py │ └── exercises_done.py ├── whitespace.py └── exercises.py ├── LICENSE.txt ├── 5_geocode ├── geocode.py └── completed │ └── geocode_done.py ├── 9_other_scrapes ├── other_scrapes.py ├── completed │ ├── other_scrapes_done.py │ └── other_scrapes_post_done.py └── other_scrapes_post.py ├── 7_out_of_dbs ├── chicago_crime.html ├── to_json.py └── completed │ └── to_json_done.py └── 3_parse_addresses ├── payday.py └── completed └── payday_done.py /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /site/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6_from_apis/crime.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/6_from_apis/crime.db -------------------------------------------------------------------------------- /site/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/site/img/favicon.ico -------------------------------------------------------------------------------- /8_cleaning/Candidates.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/8_cleaning/Candidates.xlsx -------------------------------------------------------------------------------- /6_from_apis/crime_backup.json.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/6_from_apis/crime_backup.json.zip -------------------------------------------------------------------------------- /8_cleaning/Candidates_backup.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/8_cleaning/Candidates_backup.xlsx -------------------------------------------------------------------------------- /10_encoding_debugging/some_text.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/10_encoding_debugging/some_text.txt -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/site/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/site/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /site/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/site/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /10_encoding_debugging/broken_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ireapps/coding-for-journalists/HEAD/10_encoding_debugging/broken_code.png -------------------------------------------------------------------------------- /site/mkdocs/js/search-results-template.mustache: -------------------------------------------------------------------------------- 1 |
2 |     <h3><a href="{{location}}">{{title}}</a></h3>
3 |     <p>{{summary}}</p>
4 | </article>
5 | -------------------------------------------------------------------------------- /4_make_function/call_function.py: -------------------------------------------------------------------------------- 1 | # Let's import our ParseMyAddresses function from payday_parser.py 2 | 3 | # Run the function on the consumer_installment HTML file. 4 | -------------------------------------------------------------------------------- /4_make_function/completed/call_function_done.py: -------------------------------------------------------------------------------- 1 | # Let's import our ParseMyAddresses function from payday_parser.py 2 | from payday_parser_done import ParseMyAddresses 3 | 4 | # Run the function on the consumer_installment HTML file. 5 | ParseMyAddresses('consumer_installment.html') 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.ssl-match-hostname==3.4.0.2 2 | beautifulsoup4==4.3.2 3 | certifi==2015.4.28 4 | click==4.0 5 | et-xmlfile==1.0.1 6 | geopy==1.10.0 7 | gnureadline==6.3.3 8 | ipython==3.1.0 9 | jdcal>=1.0 10 | Jinja2==2.7.3 11 | livereload==2.4.0 12 | Markdown==2.6.2 13 | MarkupSafe==0.23 14 | mkdocs==0.14.0 15 | openpyxl==2.3.0 16 | PyYAML==3.11 17 | requests==2.7.0 18 | six==1.9.0 19 | tornado==4.2 20 | -------------------------------------------------------------------------------- /requirements-win.txt: -------------------------------------------------------------------------------- 1 | backports.ssl-match-hostname==3.4.0.2 2 | beautifulsoup4==4.3.2 3 | certifi==2015.4.28 4 | click==4.0 5 | et-xmlfile==1.0.1 6 | geopy==1.10.0 7 | ipython==3.1.0 8 | jdcal>=1.0 9 | Jinja2==2.7.3 10 | livereload==2.4.0 11 | Markdown==2.6.2 12 | MarkupSafe==0.23 13 | mkdocs==0.14.0 14 | openpyxl==2.3.0 15 | pyreadline==2.1 16 | PyYAML==3.11 17 | requests==2.7.0 18 | six==1.9.0 19 | tornado==4.2 20 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /2_web_scrape/table_example.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
<td>Name</td><td>Age</td><td>City</td><td>State</td>
<td>Joe Taxpayer</td><td>33</td><td>Chicago</td><td>Illinois</td>
<td>John Smith</td><td>49</td><td>Toledo</td><td>Ohio</td>
<td>Jane Doe</td><td>17</td><td>Tampa</td><td>Florida</td>
33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Coding for Journalists 2 | 3 | This class is an evolving introduction to coding and the Python programming language for journalists. In addition to a tour of the fundamentals, it spans four basic projects to get you started. 4 | 5 | * A guide and all documentation live at [coding-for-journalists.rtfd.org](http://coding-for-journalists.rtfd.org) 6 | * The GitHub repository that includes all code is available from [github.com/ireapps/coding-for-journalists](https://github.com/ireapps/coding-for-journalists) 7 | * A good place to raise issues with the code or ask specific questions about the code is [also on GitHub](https://github.com/ireapps/coding-for-journalists/issues) 8 | * [Email](mailto:alex@ire.org) IRE Training Director Alex Richards, the primary author of this course, or [contact him on Twitter](http://www.twitter.com/alexrichards) -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Coding for Journalists 2 | site_description: A class to familiarize reporters with basic programming. 3 | site_author: Alex Richards 4 | 5 | repo_url: https://github.com/ireapps/coding-for-journalists/ 6 | 7 | pages: 8 | - Main: index.md 9 | - Setting up your computer: install.md 10 | - Introduction: pt1.md 11 | - Scraping the web: pt2.md 12 | - Parsing text across lines: pt3.md 13 | - Making a function: pt4.md 14 | - Geocoding addresses: pt5.md 15 | - APIs and databases: pt6.md 16 | - Unlocking data from databases: pt7.md 17 | - Cleaning data: pt8.md 18 | - Other scrapes: pt9.md 19 | - Text encoding and debugging: pt10.md 20 | - Next steps: next_steps.md 21 | theme: readthedocs 22 | 23 | markdown_extensions: 24 | - admonition 25 | - codehilite 26 | 27 | copyright: ©2015, Investigative Reporters & Editors, Inc. -------------------------------------------------------------------------------- /docs/pt4.md: -------------------------------------------------------------------------------- 1 | #### Making a function 2 | 3 | This is a pretty quick task -- the next time we have to deal with a list from the same agency, we shouldn't have to spend time rewriting our code from scratch or even go back to revise it to handle a new file. 
We have something that works, so let's turn it into a function we can call whenever we need to parse addresses for a list of financial licensees. 4 | 5 | This exercise contains the following files: 6 | 7 | **payday_parser.py**: Our parser from [from the previous exercise](pt3.md). We'll turn the existing work into a function and generalize it a bit to handle a file that's not specifically the list of licensed payday lenders. 8 | 9 | **consumer_installment.html**: Another listing from the state of Illinois, but this time it encompasses more than 1,000 licensed consumer installment lenders. 10 | 11 | **call_function.py**: A script we'll write to call the parser function from **payday_parser.py** and direct it toward our HTML file. 12 | 13 | Finished versions are in the **completed** folder. -------------------------------------------------------------------------------- /1_intro/fun_with_subs.py: -------------------------------------------------------------------------------- 1 | # Here are some strings with a similar structure: 2 | 3 | some_string = "Alex is in Chicago, writing some code." 4 | some_other_string = "John is in the park, feeding some birds." 5 | and_another_string = "Sandra is in Austin, hiring some people." 6 | promise_last_string = "Jenny is in an office, editing some stories." 7 | 8 | # Redundant, right? The idea is that you can weave in information from other 9 | # variables into a string with substitution; it's kind of like Madlibs. 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | # But this format of substitution is being depreciated for a similar method. 18 | 19 | 20 | 21 | # Numbers in the brackets let you specify position. 22 | 23 | 24 | 25 | # Nice thing about .format? You can repeat variables when necessary: 26 | 27 | 28 | 29 | # With substitution, we could handle printing all those strings above with 30 | # a loop. 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | # If we needed to add to these four categories: 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /10_encoding_debugging/encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 2 | 3 | # If you're running into problems with encoding, your code probably mixes 4 | # strings, which are stored in bytes, and unicode, which is stored in code 5 | # points. 6 | # 7 | # If you get the dreaded UnicodeDecodeError, follow these steps. 8 | # 9 | # 1. DECODE EARLY 10 | # 2. DO YOUR STUFF 11 | # 3. ENCODE LATE 12 | 13 | # Open an file and read the lines into a variable. This file uses Windows 14 | # encoding: cp1252 15 | with open('some_text.txt', 'rb') as infile: 16 | new_text = infile.readlines() 17 | 18 | # Print a unicode string and the lines from the file we read in, which 19 | # will break because we're putting a string from the file with unicode. 20 | # Easy fix: decode each line and print. 21 | for line in new_text: 22 | prefix = u"This is a line: " 23 | print prefix + line 24 | 25 | # Save decoded strings to a new list here. 26 | 27 | 28 | 29 | 30 | 31 | # This will come out as garbage unless we encode. Switch from new_text to 32 | # the decoded_text, and then encode during the write. 
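# (A hedged sketch of the "save decoded strings" step left blank above; it
# mirrors what completed/encoding_done.py does before writing the output file:
# decode each cp1252 byte string into unicode first.)
decoded_text = []
for line in new_text:
    decoded_text.append(line.decode('cp1252'))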
33 | with open('output_text.txt', 'wb') as outfile: 34 | for line in new_text: 35 | outfile.write(line) 36 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Investigative Reporters & Editors, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /10_encoding_debugging/completed/encoding_done.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 2 | 3 | # If you're running into problems with encoding, your code probably mixes 4 | # strings, which are stored in bytes, and unicode, which is stored in code 5 | # points. 6 | # 7 | # If you get the dreaded UnicodeDecodeError, follow these steps. 8 | # 9 | # 1. DECODE EARLY 10 | # 2. DO YOUR STUFF 11 | # 3. ENCODE LATE 12 | 13 | # Open an file and read the lines into a variable. This file uses Windows 14 | # encoding: cp1252 15 | with open('some_text.txt', 'rb') as infile: 16 | new_text = infile.readlines() 17 | 18 | # Print a unicode string and the lines from the file we read in, which 19 | # will break because we're putting a string from the file with unicode. 20 | # Easy fix: decode each line and print. 21 | for line in new_text: 22 | prefix = u"This is a line: " 23 | print prefix + line.decode('cp1252') 24 | 25 | # Save decoded strings to a new list. 26 | decoded_text = [] 27 | for line in new_text: 28 | decoded_text.append(line.decode('cp1252')) 29 | 30 | # This will come out as garbage unless we encode. 31 | with open('output_text.txt', 'wb') as outfile: 32 | for line in decoded_text: 33 | outfile.write(line.encode('utf8')) 34 | -------------------------------------------------------------------------------- /8_cleaning/fun_with_excel.py: -------------------------------------------------------------------------------- 1 | # Excel still has a place in your work life with Python. People have written 2 | # libraries so that you can read and write the modern XML-based file format 3 | # for Microsoft Excel. Note: xlrd and xlwt exist for older Excel formats (.xls) 4 | 5 | 6 | 7 | # Let's create an Excel workbook. 8 | 9 | 10 | # It starts with a default sheet (called 'Sheet') but let's make another. 11 | 12 | 13 | # Print the names of the sheets in this new, unsaved workbook. 
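# (One possible fill-in, assuming the blanks above imported openpyxl's Workbook
# and created the workbook as wb = Workbook(); that is how
# completed/fun_with_excel_done.py handles it.)
# print wb.sheetnames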
14 | 15 | 16 | # Set this new sheet to be the active sheet. 17 | 18 | 19 | # Print the name of the active sheet. 20 | 21 | 22 | # We'll select this new sheet (again) and start adding some basic data to it. 23 | 24 | 25 | # If we dislike the title, we can change it. 26 | 27 | 28 | # Let's assign the first three columns of the workbook some header names. 29 | 30 | 31 | 32 | 33 | # We can retrieve values in cells in a similar manner. 34 | 35 | 36 | # Let's make a list of values to put into the second row, right underneath. 37 | 38 | 39 | 40 | 41 | 42 | # We can also comb through rows and get the output. 43 | 44 | 45 | 46 | # One catch: it's basically all in memory until we write the workbook to disk. 47 | 48 | -------------------------------------------------------------------------------- /2_web_scrape/fun_with_csv.py: -------------------------------------------------------------------------------- 1 | # A common data format we know and love: delimited text files. Part of the 2 | # standard library deals with CSVs (text files with Comma-Separated Values). 3 | # This is just kind of modern shorthand for files with delimiters, and the 4 | # csv library can deal with more than just commas. 5 | 6 | 7 | 8 | # Let's try writing some stuff to a file first. Open a file and get a 9 | # writer object started that will actually transcribe the data to the file. 10 | 11 | 12 | 13 | 14 | # Make a list with column headers and write it to the file: FIRSTNAME, 15 | # LASTNAME, CITY. 16 | 17 | 18 | 19 | 20 | # Based on the headers, write two more rows to the file. 21 | 22 | 23 | 24 | 25 | # Close the file, otherwise the data may only be partially recorded. And by 26 | # "otherwise," I mean definitely. 27 | 28 | 29 | 30 | # So another thing we can do as far as opening a file, doing something with 31 | # the contents and then closing it is using a slightly different syntax: with. 32 | 33 | 34 | 35 | 36 | 37 | 38 | # With the file open, do some stuff, and then the file will close 39 | # automatically when the code inside the with statement is concluded. Not 40 | # perfect for every situation, but it can be slightly more elegant sometimes. 41 | -------------------------------------------------------------------------------- /6_from_apis/fun_with_datetime.py: -------------------------------------------------------------------------------- 1 | # Date and time follow slightly different conventions in Python then other 2 | # places you've probably used them before. 3 | 4 | 5 | 6 | # Get the current date and time and stick it in a variable called 'now.' 7 | 8 | 9 | # That object containing year, month, day, hour, minute, second and 10 | # millisecond is now frozen in that variable. Printing it will send a more 11 | # readable string. 12 | 13 | 14 | 15 | # We can isolate parts of the datetime object, too. 16 | 17 | 18 | 19 | 20 | # It has a few built-in formats that might look familiar to you. 21 | 22 | 23 | 24 | # You can also take the reins and output a string based on how you need it 25 | # to be displayed: https://docs.python.org/2/library/time.html#time.strftime 26 | 27 | 28 | # We can, of course, also make our own datetime object. 29 | 30 | 31 | 32 | 33 | # This is especially useful when you're trying to gauge the difference 34 | # between dates, something we frequently have to do in analysis. Let's do 35 | # some simple math to see how may days until January 1, 2018. 36 | 37 | 38 | 39 | # What gets returned by that math is a datetime.timedelta, and we can use 40 | # that to add or subtract time, too. 
What is the date 219 days and two hours 41 | # from now? 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/pt9.md: -------------------------------------------------------------------------------- 1 | #### Other scrapes 2 | 3 | Scraping can be about more than parsing tables tags in HTML. 4 | 5 | The first thing we'll do is collect files from a website with Python. 6 | 7 | Then we'll turn out attention to this common problem: ```javascript:__doPostBack()``` isn't a link you can follow with a click in Python, but sometimes the data you need is behind it. 8 | 9 | If we're trying to collect data from a government website using ASP.NET, we can watch the transaction between our browser and the site unfurl using developer tools like the ones built into Chrome. The requests library isn't just good for fetching URLs — it's full service. It can POST information as well with the intention of getting a response from the site. Based on what we see in terms of exhanged headers, we can copy that information and use requests to send it ourselves. 10 | 11 | The files we'll be using: 12 | 13 | - **other_scrapes.py**: This script will collect a set of PDFs. We'll point BeautifulSoup at the page contents to collect the links and then pipe the contents of those links (the PDFs) to files on our machine. 14 | 15 | - **other_scrapes_post.py**: We'll use requests to POST data to the Illinois Elections site, causing its ASP.NET framework to give us a tab-delimited text file that contain's the days political contributions. -------------------------------------------------------------------------------- /1_intro/var.py: -------------------------------------------------------------------------------- 1 | # Hi there! This file contains some variables that represent different kinds of data 2 | # structures in Python. Part of the lesson will involve you creating your own to see 3 | # how they work, but we can also load these directly without having to type a bunch 4 | # of stuff out. 5 | # 6 | # By the way--any line of text that begins with '#' is considered a comment; the Python 7 | # interpreter won't touch it. It's a great place to make notes about what your code 8 | # is doing. 9 | # 10 | 11 | # ---------------------- 12 | # SOME VARIABLES TO LOAD 13 | # ---------------------- 14 | 15 | # An integer 16 | lucky_number = 7 17 | 18 | 19 | # A string 20 | sentiment = 'I\'m moderately excited about learning some Python!' 21 | 22 | 23 | # A float 24 | gas_price = 3.44 25 | 26 | 27 | # A boolean 28 | at_bootcamp = True 29 | 30 | 31 | # A list 32 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov'] 33 | 34 | 35 | # A dictionary 36 | person_info = {'first_name': 'James', 'last_name': 'Halpert', 'middle': 'D', 'city': 'Philadelphia'} 37 | 38 | 39 | # A list that's made up of lists 40 | multi_list = [['Apple', 'Banana', 'Pear'], ['Ford', 'Toyota', 'Volkswagen', 'Buick'], [5, 3, 42]] 41 | 42 | 43 | # A string with too much whitespace 44 | ugly_string = ' He loves San Dimas ' 45 | -------------------------------------------------------------------------------- /2_web_scrape/fun_with_bs.py: -------------------------------------------------------------------------------- 1 | # This is a quick primer on BeautifulSoup and some of its key abilites. For 2 | # more, see the docs: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 3 | 4 | 5 | 6 | # The sequence of events with BeautifulSoup: 7 | 8 | # 1. Get a file. 
Maybe you have requests playing web browser and it will hand 9 | # you the file contents. Here we'll just read a file into a variable. 10 | 11 | 12 | 13 | 14 | # 2. Make a BeautifulSoup object out of HTML file contents. This makes the 15 | # underlying HTML something BeautifulSoup can navigate and parse. 16 | 17 | 18 | 19 | # 2a. Peek at the HTML that BS has gone to work on, if you'd like. 20 | 21 | 22 | 23 | # 3. Isolate the information that you want to collect. This is where BS 24 | # really shines. This is an example of very simple criteria: HTML 25 | # within tags. 26 | 27 | 28 | 29 | # 4. Start walking through this isolated information; for a table, the pattern 30 | # generally dives into each row and then each cell. 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | # for row in table: 39 | # make empty list to hold cell text 40 | # for cell in row: 41 | # append cell text to list 42 | # print the list contents joined together by commas 43 | 44 | # This scraped information can then be written to a file or manipulated 45 | # further. 46 | -------------------------------------------------------------------------------- /docs/pt3.md: -------------------------------------------------------------------------------- 1 | #### Parsing text across lines 2 | 3 | It would be great if every piece of data you came across was in a format that lent itself to easy capture. In the same vein as the text extraction from reactor detail pages in our last example, we're going to pick apart an HTML file of licensed payday lenders (that's mostly text) and turn it into a flat CSV file where one row is one record. 4 | 5 | In this file, addresses can span three, four or five lines. Sometimes it's on four lines because a lender does business in Illinois under another name; in others, it's because the lender operates out of a suite, room or building stored on a line separate from the street address. This means that our script needs to behave four different ways depending on how many lines it encounters for each address, and we'll switch among those behaviors with ```if/elif``` syntax. 6 | 7 | We'll again use ```BeautifulSoup```, but primarily to break out the portion of the file we want to capture for the resulting CSV. 8 | 9 | This exercise has the following files: 10 | 11 | - **payday.py**: The file we'll use to write our address parser, following the comments. 12 | 13 | - **payday_lenders.html**: A simple HTML file that lists nearly 500 payday lenders licensed to do business in Illinois. Their addresses are split across multiple lines. 14 | 15 | As with other exercises, finished versions are in the **completed** folder. 16 | -------------------------------------------------------------------------------- /6_from_apis/fun_with_sqlite.py: -------------------------------------------------------------------------------- 1 | # SQLite is a lightweight database manager that's part of Python's standard 2 | # library, so it's a good example of how to hook a script up to a database. 3 | # If you work in MySQL or Postgres, there are libraries you can use to make 4 | # a connection and gain similar functionality. 5 | 6 | 7 | 8 | # Connect to a test database; if one doesn't exist, it will be created on 9 | # the fly. We also fire up a cursor to poke, prod and manipulate our 10 | # database. 11 | 12 | 13 | 14 | # Right now it's an empty database with no tables and no data. Let's create 15 | # basic one that holds some CEO information. 
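# (A hedged sketch of that CREATE TABLE statement, assuming the cursor created
# above is named c; completed/fun_with_sqlite_done.py uses this same layout.)
# c.execute(
#     'CREATE TABLE ceos '
#     '(ceo_name text, company text, salary int)')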
16 | 17 | 18 | 19 | 20 | # NOTE: with scripts, somestimes it's a good idea to preface a CREATE 21 | # TABLE query with IF NOT EXISTS, that way you won't get an operational 22 | # error. 23 | 24 | # Let's insert three CEO names, companies and salaries into our ceos table. 25 | 26 | 27 | 28 | 29 | 30 | 31 | # When we alter a table, we have to commit those changes. 32 | 33 | 34 | # Let's run a quick query that gives us everything in the table. 35 | 36 | 37 | 38 | # The database has run the query and gives it back to use as a list of tuples 39 | # for each row. We have to fetch this information. 40 | 41 | 42 | 43 | # Try fetchall() again; it should be empty and will be until we run another 44 | # query. 45 | 46 | 47 | # Let's try another basic query: a sum of the salaries. 48 | 49 | 50 | 51 | 52 | 53 | # One more: companies that start with 'A,' sorted in descending order by 54 | # salary 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /site/license/highlight.js/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006, Ivan Sagalaev 2 | All rights reserved. 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of highlight.js nor the names of its contributors 12 | may be used to endorse or promote products derived from this software 13 | without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /8_cleaning/clean_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | # ['LASTONLYNAME', 'FIRSTNAME', 'EXPENDEDDATE', 'AMOUNT', 'ADDRESS1', 13 | # 'ADDRESS2', 'CITY', 'STATE', 'ZIP', 'EXPTYPE', 'PURPOSE', 'BENEFICIARY'] 14 | 15 | # Let's just go ahead and write a function for this. 16 | 17 | 18 | 19 | # Let's worry about the columns with problems. 20 | # LASTONLYNAME needs to be uppercase. 
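# (Hedged example of one way to do that, assuming the cleaning function sees
# each row as a list ordered like the header above; clean_csv_done.py may
# structure this differently.)
# row[0] = row[0].upper()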
21 | 22 | # AMOUNT suffers from whitespace and dollar signs 23 | 24 | # CITY contains some problematic spellings of 'Chicago' 25 | # and non-breaking spaces for display ( ) 26 | 27 | 28 | 29 | 30 | # ZIP has leading zeros removed 31 | 32 | 33 | 34 | 35 | # One thing with PURPOSE: there's additional detail after 36 | # a - or /. We can use regex to specify multiple split criteria. 37 | 38 | # Have to set an if/else up for if it doesn't have one of these. 39 | 40 | 41 | 42 | 43 | 44 | 45 | # There are also synonymous words present: 'fee,' 'fees,' 46 | # 'cost,' 'costs,' 'expense.' Replacing this with 'expenses' 47 | # will go a long way toward cleaner categories. 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | # All done; let's return a revised row that contains our fixes and 57 | # rows we didn't touch. 58 | 59 | 60 | 61 | # Our file is loaded and ready to go. We have a cleaning function. Let's fix the 62 | # headers to match our file changes before we loose the function on the file. 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | # Here's where we can weed out non-expenditures from hitting our clean file. 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /2_web_scrape/completed/fun_with_csv_done.py: -------------------------------------------------------------------------------- 1 | # A common data format we know and love: delimited text files. Part of the 2 | # standard library deals with CSVs (text files with Comma-Separated Values). 3 | # This is just kind of modern shorthand for files with delimiters, and the 4 | # csv library can deal with more than just commas. 5 | 6 | import csv 7 | 8 | # Let's try writing some stuff to a file first. Open a file and get a 9 | # writer object started that will actually transcribe the data to the file. 10 | 11 | outfile = open('my_test.csv', 'wb') 12 | csv_writer = csv.writer(outfile) 13 | 14 | # Make a list with column headers and write it to the file: FIRSTNAME, 15 | # LASTNAME, CITY. 16 | 17 | headers = ['FIRSTNAME', 'LASTNAME', 'CITY'] 18 | csv_writer.writerow(headers) 19 | 20 | # Based on the headers, write two more rows to the file. 21 | 22 | csv_writer.writerow(['Alex', 'Richards', 'Chicago']) 23 | csv_writer.writerow(['John', 'Smith', 'New York']) 24 | 25 | # Close the file, otherwise the data may only be partially recorded. And by 26 | # "otherwise," I mean definitely. 27 | 28 | outfile.close() 29 | 30 | # So another thing we can do as far as opening a file, doing something with 31 | # the contents and then closing it is using a slightly different syntax: with. 32 | 33 | with open('my_test.csv', 'rb') as infile: 34 | csv_reader = csv.reader(infile) 35 | for row in csv_reader: 36 | print row 37 | 38 | # With the file open, do some stuff, and then the file will close 39 | # automatically when the code inside the with statement is concluded. Not 40 | # perfect for every situation, but it can be slightly more elegant sometimes. 41 | -------------------------------------------------------------------------------- /6_from_apis/completed/fun_with_datetime_done.py: -------------------------------------------------------------------------------- 1 | # Date and time follow slightly different conventions in Python then other 2 | # places you've probably used them before. 3 | 4 | import datetime 5 | 6 | # Get the current date and time and stick it in a variable called 'now.' 
7 | now = datetime.datetime.now() 8 | 9 | # That object containing year, month, day, hour, minute, second and 10 | # millisecond is now frozen in that variable. Printing it will send a more 11 | # readable string. 12 | now 13 | print now 14 | 15 | # We can isolate parts of the datetime object, too. 16 | print now.year 17 | print now.hour 18 | print now.day 19 | 20 | # It has a few built-in formats that might look familiar to you. 21 | print now.ctime() 22 | print now.isoformat() 23 | 24 | # You can also take the reins and output a string based on how you need it 25 | # to be displayed: https://docs.python.org/2/library/time.html#time.strftime 26 | print now.strftime('%m/%d/%y %I:%M%p') 27 | 28 | # We can, of course, also make our own datetime object. 29 | my_datetime = datetime.date(2014, 5, 7) 30 | print my_datetime.isoformat() 31 | print my_datetime.month 32 | 33 | # This is especially useful when you're trying to gauge the difference 34 | # between dates, something we frequently have to do in analysis. Let's do 35 | # some simple math to see how may days until January 1, 2018. 36 | diff = datetime.datetime(2018, 1, 1) - now 37 | print diff.days 38 | 39 | # What gets returned by that math is a datetime.timedelta, and we can use 40 | # that to add or subtract time, too. What is the date 219 days and two hours 41 | # from now? 42 | the_future = now + datetime.timedelta(days=219, hours=2) 43 | print the_future.ctime() 44 | -------------------------------------------------------------------------------- /2_web_scrape/completed/fun_with_bs_done.py: -------------------------------------------------------------------------------- 1 | # This is a quick primer on BeautifulSoup and some of its key abilites. For 2 | # more, see the docs: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | # The sequence of events with BeautifulSoup: 7 | 8 | # 1. Get a file. Maybe you have requests playing web browser and it will hand 9 | # you the file contents. Here we'll just read a file into a variable. 10 | 11 | with open('table_example.html', 'rb') as infile: 12 | example = infile.read() 13 | 14 | # 2. Make a BeautifulSoup object out of HTML file contents. This makes the 15 | # underlying HTML something BeautifulSoup can navigate and parse. 16 | 17 | soup = BeautifulSoup(example, 'html.parser') 18 | 19 | # 2a. Peek at the HTML that BS has gone to work on, if you'd like. 20 | 21 | print soup.prettify() 22 | 23 | # 3. Isolate the information that you want to collect. This is where BS 24 | # really shines. This is an example of very simple criteria: HTML 25 | # within
tags. 26 | 27 | table = soup.find('table') 28 | 29 | # 4. Start walking through this isolated information; for a table, the pattern 30 | # generally dives into each row and then each cell. 31 | 32 | for row in table.find_all('tr'): 33 | extract = [] 34 | for cell in row.find_all('td'): 35 | extract.append(cell.text) 36 | print ', '.join(extract) 37 | 38 | # for row in table: 39 | # make empty list to hold cell text 40 | # for cell in row: 41 | # append cell text to list 42 | # print the list contents joined together by commas 43 | 44 | # This scraped information can then be written to a file or manipulated 45 | # further. 46 | -------------------------------------------------------------------------------- /8_cleaning/completed/fun_with_excel_done.py: -------------------------------------------------------------------------------- 1 | # Excel still has a place in your work life with Python. People have written 2 | # libraries so that you can read and write the modern XML-based file format 3 | # for Microsoft Excel. Note: xlrd and xlwt exist for older Excel formats (.xls) 4 | 5 | from openpyxl import Workbook 6 | 7 | # Let's create an Excel workbook. 8 | wb = Workbook() 9 | 10 | # It starts with a default sheet (called 'Sheet') but let's make another. 11 | wb.create_sheet('MyStuff') 12 | 13 | # Print the names of the sheets in this new, unsaved workbook. 14 | print wb.sheetnames 15 | 16 | # Set this new sheet to be the active sheet. 17 | wb.active = 1 18 | 19 | # Print the name of the active sheet. 20 | print wb.active 21 | 22 | # We'll select this new sheet (again) and start adding some basic data to it. 23 | ws = wb.get_sheet_by_name('MyStuff') 24 | 25 | # If we dislike the title, we can change it. 26 | ws.title = 'OtherStuff' 27 | 28 | # Let's assign the first three columns of the workbook some header names. 29 | ws['A1'] = 'Name' 30 | ws['B1'] = 'Company' 31 | ws['C1'] = 'Salary' 32 | 33 | # We can retrieve values in cells in a similar manner. 34 | ws['B1'].value 35 | 36 | # Let's make a list of values to put into the second row, right underneath. 37 | person = ['Laura Green', 'Dynamic Dynamics', '12550000'] 38 | 39 | for detail in person: 40 | ws.cell(column=person.index(detail)+1, row=2, value=detail) 41 | 42 | # We can also comb through rows and get the output. 43 | for row in ws.rows[1:]: 44 | print ', '.join([row[0].value, row[1].value, row[2].value]) 45 | 46 | # One catch: it's basically all in memory until we write the workbook to disk. 47 | wb.save('mytest.xlsx') 48 | -------------------------------------------------------------------------------- /2_web_scrape/scrape.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We want to scrape the contents of a table into a delimited file. 2 | # 3 | # HOW WE'RE GOING TO DEAL WITH IT: 4 | # - Use the requests library to grab the page source 5 | # - Use BeautifulSoup to navigate to the table and then loop through its rows 6 | # - Write it all to a csv file 7 | # - Handle some encoding issues 8 | 9 | # Let's import the libraries we'll be using for this parsing task: BeautifulSoup from bs4, 10 | # Python's csv library and requests 11 | 12 | 13 | 14 | 15 | # The data table we want is at 16 | # 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html' 17 | 18 | 19 | # Fetch the page with requests, set the page encoding and turn it into a BeautifulSoup 20 | # object. 21 | 22 | 23 | 24 | 25 | # This page only has one table, so it's not much work to find it. 
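# (One way to grab it, assuming the BeautifulSoup object created above was
# named soup; the loop further down in this script already expects a variable
# called reactor_table.)
# reactor_table = soup.find('table')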
26 | 27 | 28 | # Let's make a new, empty csv file and a csv.writer which will take write our 29 | # table one row at a time to the file. 30 | 31 | 32 | # The first row written will be our field names. 33 | 34 | 35 | # We want to loop through all tags (rows) except for the header. 36 | 37 | # Each has some tags (rows) except for the header. 36 | for row in reactor_table.find_all('tr')[1:]: 37 | # Each has some
(cells) below it; these are what we'll move into variables 38 | # and then write to the csv. 39 | 40 | # Reactor name, detail page link and docket number are all part of the first cell. 41 | # Docket has a bunch of whitespace, so we'll .strip() it. 42 | 43 | 44 | 45 | 46 | # Two fields in this table have characters like en dash; we need to make sure these are 47 | # encoded properly when writing to the csv or it will break our script. 48 | 49 | 50 | 51 | 52 | # Once everything's collected, write it as a row in the csv. 53 | 54 | 55 | # Close the file and let us know it's finished. 56 | -------------------------------------------------------------------------------- /1_intro/completed/fun_with_subs_done.py: -------------------------------------------------------------------------------- 1 | # Here are some strings with a similar structure: 2 | 3 | some_string = "Alex is in Chicago, writing some code." 4 | some_other_string = "John is in the park, feeding some birds." 5 | and_another_string = "Sandra is in Austin, hiring some people." 6 | promise_last_string = "Jenny is in an office, editing some stories." 7 | 8 | # Redundant, right? The idea is that you can weave in information from other 9 | # variables into a string with substitution; it's kind of like Madlibs. 10 | 11 | name = 'Robby' 12 | print 'Where is %s?' % name 13 | 14 | age = 47 15 | print '%s is %d years old.' % (name, age) 16 | 17 | # But this format of substitution is being depreciated for a similar method. 18 | 19 | print 'Where is {}?'.format(name) 20 | 21 | # Numbers in the brackets let you specify position. 22 | 23 | print '{0} is {1} years old.'.format(name, age) 24 | 25 | # Nice thing about .format? You can repeat variables when necessary: 26 | 27 | print "Where's {0}, that adorable {1}-year-old? Oh, there's {0}.".format(name, age) 28 | 29 | # With substitution, we could handle printing all those strings above with 30 | # a loop. 31 | 32 | names = ['Alex', 'John', 'Sandra', 'Jenny'] 33 | locs = ['Chicago', 'the park', 'Austin', 'an office'] 34 | actions = ['writing', 'feeding', 'hiring', 'editing'] 35 | stuff = ['code', 'birds', 'people', 'stories'] 36 | 37 | for x in range(0, len(names)): 38 | print "{0} is in {1}, {2} some {3}.".format(names[x], locs[x], actions[x], stuff[x]) 39 | 40 | 41 | # If we needed to add to these four categories: 42 | 43 | names.append('Roger') 44 | locs.append('Pasadena') 45 | actions.append('baking') 46 | stuff.append('cookies') 47 | 48 | print "{0} is in {1}, {2} some {3}.".format(names[-1], locs[-1], actions[-1], stuff[-1]) 49 | -------------------------------------------------------------------------------- /docs/pt8.md: -------------------------------------------------------------------------------- 1 | #### Cleaning data 2 | 3 | No data are ever perfect, but that doesn't mean you're not going to have to tangle with its shortcomings on deadline. Once we assess the issues, we can outsource the tedium of cleaning to a Python script. If we ever receive an updated data set from the same source, we have an automated way to run through out and output something we can use for reporting. 4 | 5 | This exercise hits three different kinds of cleaning that are fairly common: 6 | 7 | - Data that don't arrive in flat file format, where one row equals one record. We'll work through a very formatted Excel file of candidate filings and convert it to a worksheet we can use. 8 | 9 | - Data that are just plain dirty: leading and trailing spaces, misspellings, unnecessary characters and more. 
10 | 11 | - Parsing out names. We'll scrape some board and commission pages from the city of Chicago's website and use some hints in table to help us split names into title, first, middle, last and suffix. 12 | 13 | The files: 14 | 15 | - **excel.py**: The script we'll write to turn formatted Excel designed for printing into a flat file we can use for analysis and reporting. 16 | 17 | - **fun_with_excel.py**: A tour of the ```openpyxl``` library and how it can read and write Microsoft Excel formats from 2007 and later. 18 | 19 | - **clean_csv.py**: We'll write Python in this to process a dirty CSV file. 20 | 21 | - **names.py**: The script that will scrape board and commission pages and process the names. 22 | 23 | - **Candidates.xlsx**: Our formatted Excel file of campaign filings. 24 | 25 | - **Candidates_backup.xlsx**: A backup of the formatted Excel file (in case we mess it up). 26 | 27 | - **rahm_spending.csv**: The problematic CSV file with dirty columns. 28 | 29 | In case of no internet, **boards_backup** contains the files to scrape for **names.py**. Finished versions are in **completed**. -------------------------------------------------------------------------------- /5_geocode/geocode.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We need coordinates for the addresses we parsed earlier. 2 | # 3 | # HOW WE'RE GOING TO DEAL WITH IT: 4 | # - Use a library called geopy to run an address through Google's geocoder 5 | # - Use a sleep function to pause so we don't swamp Google 6 | # - Keep track of it all with a keyed dictionary (coming in and going out) 7 | # - Put the original information AND our returned lat/long coordinates into a 8 | # new csv file 9 | 10 | # Import the Google geocoder from geopy as well as Python's csv and time libaries 11 | 12 | 13 | # Make a geolocator object 14 | 15 | 16 | # Open our address file and start a DictReader object that will give each element in 17 | # each row a key/value pair based on the header columns in the file. 18 | 19 | 20 | # We'll go ahead and set up a new file for our eventual output. 21 | 22 | # For the DictWriter, we have to give it a list of fields from the get-go to establish 23 | # the order; we'll go ahead and write a header to the file, too. 24 | 25 | 26 | 27 | # Start for loop here 28 | 29 | # We're going to put an if/else here to prevent the whole class from launching a 30 | # volley of 500 requests at Google. Let's get the first five (row 1 is the header). 31 | 32 | # Put the address in a Google-recognizable string: ADDRESS, CITY, STATE ZIP 33 | 34 | # Geocode that string 35 | 36 | # Plug results from the geocoder right back into the same row of data with new key 37 | # values: the returned latitude, longitude and address Google matched on. 38 | 39 | 40 | 41 | # Write the modified row to our new csv. 42 | 43 | # To keep tabs on what's happening, get a printed message with address and line. 44 | 45 | # Before we do all of this with the next row, pause for two seconds. 46 | 47 | 48 | 49 | 50 | # Alert us with a printed message when this completes and close both files. 51 | -------------------------------------------------------------------------------- /docs/pt10.md: -------------------------------------------------------------------------------- 1 | #### Encoding and debugging 2 | It's hard to succinctly describe why text encoding is a problem. Basically, it stems from the early days of computing when Western characters mattered. 
Others, not so much — they were barely an afterthought. 3 | 4 | By the way, the em-dash on the last sentence? That can be represented many different ways based on the encoding of the text. 5 | 6 | Different encodings translate the bits that underpin all manner of characters into what we actually see on screen. Many have the same for A through Z, 0 through 9 and basic punctuation, so you don't even know you have a problem. 7 | 8 | For more information, read this [sweeping overview](http://www.joelonsoftware.com/articles/Unicode.html). 9 | 10 | When it's a problem, the trick is to decode your input early into [unicode](https://en.wikipedia.org/wiki/Unicode), do whatever you need to do with your code, and then encode back into something like UTF8 when it's been written to a file. 11 | 12 | We encountered this a bit during our scrape at the beginning of the workshop. We had a few columns in the table with these other characters, and we were combining them with regular strings into individual lines that would be written to a CSV. We encoded them before they were written so they didn't break our script. 13 | 14 | Files to use: 15 | 16 | - **encoding.py**: A script where we're trying to read a text file, print the lines to the screen and then write the output to another text file. It's not working out so well. 17 | 18 | - **broken_code.png**: Flow chart for common errors in Python, what they mean and what else you can check when things aren't working out exactly as you expect them to. 19 | 20 | - **some_text.txt**: A simple text file, encoded in Windows-1252, known by [Python as 'cp1252.'](https://docs.python.org/2/library/codecs.html#standard-encodings) 21 | 22 | As always, a working version of the script is in **completed**. -------------------------------------------------------------------------------- /8_cleaning/excel.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | 4 | 5 | 6 | # We're going to be messing with an existing file, so let's clone our Excel 7 | # file just to be on the safe side. 8 | 9 | 10 | 11 | # Load our Excel workbook into memory 12 | 13 | 14 | 15 | # List the sheets we find in the workbook 16 | 17 | 18 | 19 | # Active sheet will default to the first; we can also select it 20 | 21 | 22 | # We can iterate through rows an do things based on the pattern that 23 | # we find. First, though, we need to decide where to put the data. 24 | # Why not another blank sheet in the same book? 25 | 26 | 27 | 28 | # Right now there's nothing on the sheet; the max row and column would be '1' 29 | 30 | 31 | 32 | # Max_row can help us keep track of where we are as we write to the book. 33 | # Let's start by writing a header for all the information we're going to 34 | # collect. 35 | 36 | 37 | 38 | 39 | 40 | # This is our first modification to the workbook; let's save our changes. 41 | 42 | 43 | # We need something outside the loop to hold onto the race name 44 | 45 | 46 | 47 | # Let's walk through each row in our sheet with .rows 48 | 49 | # Headers are duplicated for each race in this sheet; we can grab the 50 | # race name in the cell directly above and hold onto it IF we encounter 51 | # that first header item, "Candidate's Name." 52 | 53 | 54 | 55 | # The other condition to hunt for: Candidate info is spread out over two 56 | # rows. If there's something in the first and second columns, that means 57 | # it's a candidate, and we can go to work on that row and the row below it. 
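# (A rough, hedged sketch of that check, assuming the sheet is being walked
# with something like "for row in ws.rows:"; the finished excel_done.py may
# handle the details differently.)
# if row[0].value and row[1].value:
#     # this row starts a candidate record; the row below holds the rest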
58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | # Let's break apart the combined city, state and zip 66 | 67 | 68 | 69 | 70 | 71 | # Let's write the row to Sheet2, which we've already queued up. Watch 72 | # out for the date, which we'll reformat from a Python date object. 73 | 74 | 75 | 76 | 77 | 78 | # Save changes to the file. 79 | 80 | -------------------------------------------------------------------------------- /4_make_function/payday_parser.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import csv 3 | 4 | # Let's bundle everything we did previously into a function. Let's call it 'ParseMyAddresses'. We'll pass it one variable, 5 | # the HTML file that we want parsed. 6 | 7 | # Because indentation controls how Python reads our code, we need to tab all the code over one spot under the function def. 8 | # Instead of explicitly giving Python a file name, we'll have the function pass that information along. 9 | html = open('payday_lenders.html', 'rb') 10 | soup = BeautifulSoup(html) 11 | 12 | block = unicode(soup.body) 13 | block = block.replace('

', '') 14 | 15 | list = block.split('

') 16 | list = list[2:len(list)-1] 17 | # And instead of specifying a csv file name, we'll make one out of the HTML file, replacing '.html' with '.csv' 18 | csv_file = open('payday_lenders.csv', 'wb') 19 | output = csv.writer(csv_file) 20 | output.writerow(['NAME', 'DBA', 'STADDR', 'STADDR2', 'CITY', 'STATE', 'ZIP']) 21 | 22 | for lender in list: 23 | details = lender.split('
') 24 | name = details[0].strip() 25 | if len(details) == 3: 26 | dba = '' 27 | staddr = details[1].strip() 28 | staddr2 = '' 29 | elif len(details) == 4 and details[1].upper().startswith('D/B/A'): 30 | dba = details[1].strip() 31 | staddr = details[2].strip() 32 | staddr2 = '' 33 | elif len(details) == 4 and not details[1].upper().startswith('D/B/A'): 34 | dba = '' 35 | staddr = details[1].strip() 36 | staddr2 = details[2].strip() 37 | elif len(details) == 5: 38 | dba = details[1].strip() 39 | staddr = details[2].strip() 40 | staddr2 = details[3].strip() 41 | citystzip = details[len(details)-1].split(', ') 42 | city = citystzip[0].strip() 43 | state = citystzip[1].strip() 44 | zip = citystzip[2].strip() 45 | output.writerow([name, dba, staddr, staddr2, city, state, zip]) 46 | 47 | csv_file.close() 48 | -------------------------------------------------------------------------------- /site/js/theme.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | 3 | // Shift nav in mobile when clicking the menu. 4 | $(document).on('click', "[data-toggle='wy-nav-top']", function() { 5 | $("[data-toggle='wy-nav-shift']").toggleClass("shift"); 6 | $("[data-toggle='rst-versions']").toggleClass("shift"); 7 | }); 8 | 9 | // Close menu when you click a link. 10 | $(document).on('click', ".wy-menu-vertical .current ul li a", function() { 11 | $("[data-toggle='wy-nav-shift']").removeClass("shift"); 12 | $("[data-toggle='rst-versions']").toggleClass("shift"); 13 | }); 14 | 15 | $(document).on('click', "[data-toggle='rst-current-version']", function() { 16 | $("[data-toggle='rst-versions']").toggleClass("shift-up"); 17 | }); 18 | 19 | // Make tables responsive 20 | $("table.docutils:not(.field-list)").wrap("
"); 21 | 22 | hljs.initHighlightingOnLoad(); 23 | 24 | $('table').addClass('docutils'); 25 | }); 26 | 27 | window.SphinxRtdTheme = (function (jquery) { 28 | var stickyNav = (function () { 29 | var navBar, 30 | win, 31 | stickyNavCssClass = 'stickynav', 32 | applyStickNav = function () { 33 | if (navBar.height() <= win.height()) { 34 | navBar.addClass(stickyNavCssClass); 35 | } else { 36 | navBar.removeClass(stickyNavCssClass); 37 | } 38 | }, 39 | enable = function () { 40 | applyStickNav(); 41 | win.on('resize', applyStickNav); 42 | }, 43 | init = function () { 44 | navBar = jquery('nav.wy-nav-side:first'); 45 | win = jquery(window); 46 | }; 47 | jquery(init); 48 | return { 49 | enable : enable 50 | }; 51 | }()); 52 | return { 53 | StickyNav : stickyNav 54 | }; 55 | }($)); 56 | -------------------------------------------------------------------------------- /docs/pt5.md: -------------------------------------------------------------------------------- 1 | ### Geocoding addresses 2 | 3 | For any kind of analysis that involves mapping, having coordinates is a must. En masse, though, they aren't always easy to come by. 4 | 5 | Open-source geographic information systems like [QGIS](http://www.qgis.org) don't have built-in options to figure out a location's latitude and longitude; [Esri](http://www.esri.com/software/arcgis/arcgisonline/credits) charges for some of its online geocoding services, too. 6 | 7 | Enter [geopy](https://github.com/geopy/geopy), a Python library that's designed to interact with a slew of third-party geolocation APIs. As part of a larger script, we can read lines of address data from a CSV file, send each through one of these services and return the results. 8 | 9 | In this example, we'll be using Google's geocoding service, which is good at properly interpreting an address string even when it's partially malformed or contains extraneous information. As a free service, however, it will only geocode 2,500 addresses in a 24-hour period. 10 | 11 | We'll be using the CSV file we made by parsing payday lender addresses from 3\_parse\_addresses. Our goal at the end is to have a new CSV file with three additional fields of information: 12 | 13 | - Google's match for the address 14 | - Latitude in decimal degrees (Y coordinate) 15 | - Longitude in decimal degrees (X coordinate) 16 | 17 | Google's free geocoder can only handle five requests per second, so we're going to use Python's time functions to slow our requests down and set up a control flow so that we're all only geocoding the first five addresses -- a condition that can be removed at your discretion if you're working through this task on your own later. 18 | 19 | The files we'll be working with: 20 | 21 | - **geocode.py**: A script we'll write to pass addresses through Google's geocoding service. It will take the results along with our initial data fields and send them all to a new CSV file. 22 | 23 | - **payday_lenders.csv**: Our completed CSV file from 3\_parse\_addresses. 24 | 25 | A finished version is in **completed**. -------------------------------------------------------------------------------- /4_make_function/completed/payday_parser_done.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import csv 3 | 4 | # Let's bundle everything we did previously into a function. Let's call it 'ParseMyAddresses'. We'll pass it one variable, 5 | # the HTML file that we want parsed. 
6 | 7 | 8 | def ParseMyAddresses(html_file): 9 | # Because indentation controls how Python reads our code, we need to tab all the code over one spot under the function def. 10 | # Instead of explicitly giving Python a file name, we'll have the function pass that information along. 11 | html = open(html_file, 'rb') 12 | soup = BeautifulSoup(html) 13 | 14 | block = unicode(soup.body) 15 | block = block.replace('
', '') 16 | 17 | list = block.split('
') 18 | list = list[2:len(list)-1] 19 | # And instead of specifying a csv file name, we'll make one out of the HTML file, replacing '.html' with '.csv' 20 | csv_file = open(html_file.replace('.html', '.csv'), 'wb') 21 | output = csv.writer(csv_file) 22 | output.writerow(['NAME', 'DBA', 'STADDR', 'STADDR2', 'CITY', 'STATE', 'ZIP']) 23 | 24 | for lender in list: 25 | details = lender.split('
') 26 | name = details[0].strip() 27 | if len(details) == 3: 28 | dba = '' 29 | staddr = details[1].strip() 30 | staddr2 = '' 31 | elif len(details) == 4 and details[1].upper().startswith('D/B/A'): 32 | dba = details[1].strip() 33 | staddr = details[2].strip() 34 | staddr2 = '' 35 | elif len(details) == 4 and not details[1].upper().startswith('D/B/A'): 36 | dba = '' 37 | staddr = details[1].strip() 38 | staddr2 = details[2].strip() 39 | elif len(details) == 5: 40 | dba = details[1].strip() 41 | staddr = details[2].strip() 42 | staddr2 = details[3].strip() 43 | citystzip = details[len(details)-1].split(', ') 44 | city = citystzip[0].strip() 45 | state = citystzip[1].strip() 46 | zip = citystzip[2].strip() 47 | output.writerow([name, dba, staddr, staddr2, city, state, zip]) 48 | 49 | csv_file.close() 50 | -------------------------------------------------------------------------------- /6_from_apis/completed/fun_with_sqlite_done.py: -------------------------------------------------------------------------------- 1 | # SQLite is a lightweight database manager that's part of Python's standard 2 | # library, so it's a good example of how to hook a script up to a database. 3 | # If you work in MySQL or Postgres, there are libraries you can use to make 4 | # a connection and gain similar functionality. 5 | 6 | import sqlite3 7 | 8 | # Connect to a test database; if one doesn't exist, it will be created on 9 | # the fly. We also fire up a cursor to poke, prod and manipulate our 10 | # database. 11 | conn = sqlite3.connect('my_test.sqlite') 12 | c = conn.cursor() 13 | 14 | # Right now it's an empty database with no tables and no data. Let's create 15 | # basic one that holds some CEO information. 16 | c.execute( 17 | 'CREATE TABLE ceos ' 18 | '(ceo_name text, company text, salary int)') 19 | 20 | # NOTE: with scripts, somestimes it's a good idea to preface a CREATE 21 | # TABLE query with IF NOT EXISTS, that way you won't get an operational 22 | # error. 23 | 24 | # Let's insert three CEO names, companies and salaries into our ceos table. 25 | c.execute( 26 | "INSERT INTO ceos " 27 | "VALUES ('John Smith', 'Acme, Inc.', '275000'), " 28 | "('Libby Rogers', 'AstroTech', '1200000'), " 29 | "('Darla Jones', 'Ballard Partners', '942000')") 30 | 31 | # When we alter a table, we have to commit those changes. 32 | conn.commit() 33 | 34 | # Let's run a quick query that gives us everything in the table. 35 | c.execute( 36 | "SELECT * FROM ceos") 37 | 38 | # The database has run the query and gives it back to use as a list of tuples 39 | # for each row. We have to fetch this information. 40 | result = c.fetchall() 41 | print result 42 | 43 | # Try fetchall() again; it should be empty and will be until we run another 44 | # query. 45 | c.fetchall() 46 | 47 | # Let's try another basic query: a sum of the salaries. 48 | c.execute( 49 | "SELECT SUM(salary) FROM ceos") 50 | result2 = c.fetchall() 51 | print result2 52 | 53 | # One more: companies that start with 'A,' sorted in descending order by 54 | # salary 55 | c.execute( 56 | "SELECT * FROM ceos " 57 | "WHERE company LIKE 'A%' " 58 | "ORDER BY salary DESC") 59 | result3 = c.fetchall() 60 | print result3 61 | -------------------------------------------------------------------------------- /9_other_scrapes/other_scrapes.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | 4 | 5 | 6 | 7 | 8 | # We're not always going to be diving into a table or other information tied 9 | # up in HTML. 
Sometimes our target is a file or set of files; we can write the content 10 | # to outfiles in a directory of our choosing. 11 | 12 | # Fetch the page: 13 | # https://portal.chicagopolice.org/portal/page/portal/ClearPath/News/Crime Statistics 14 | 15 | 16 | 17 | 18 | # Process the HTML with BeautifulSoup 19 | 20 | 21 | 22 | # Find all the links on the page (just for practice) 23 | 24 | 25 | # The links we want don't have great characteristics to hook onto; let's just 26 | # find everything where the link contains '.pdf' 27 | 28 | 29 | 30 | # Sometimes it's useful to me to count up what I expect to have. We'll take 31 | # a peek here and use the enumerate function to count. When iterating through 32 | # a list it returns a tuple that looks like (, ) instead 33 | # of just . 34 | 35 | 36 | 37 | 38 | # Make an empty list to hold the URLs we're going to pull out of all these links. 39 | 40 | 41 | 42 | # Loop through and grab the URLs. 43 | 44 | 45 | 46 | 47 | # Now we have a list of PDF files to grab from the page. We're going to write 48 | # the content in each to a file, and we can use information we've gleaned 49 | # from the file or link to name the files that we're writing to disk. 50 | 51 | # Let's use a dict instead; the key will be the link text and the value will 52 | # be the URL. 53 | 54 | 55 | 56 | 57 | 58 | 59 | # We can get the original file name out of the URL by splitting URL at '/' and 60 | # grabbing the last item in the list. 61 | 62 | 63 | 64 | 65 | # Since the original file names don't contain any information about the District 66 | # or Area, let's add that in. 67 | 68 | # Also, let's make sure we're not bombarding the site with a ton of requests at 69 | # the same time; we need to briefly pause between downloads. 70 | 71 | # We'll pass the URL to a variable, get it with requests, do a little tinkering 72 | # with the PDF name and link name to remove spaces (and the URL equivalent), 73 | # then write them to a directory. 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | # Make a directory to hold these files 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /docs/next_steps.md: -------------------------------------------------------------------------------- 1 | ### Next Steps 2 | 3 | If you're curious about the different paths for installing Python and various libraries on your own computer, some excellent resources exist as part of the [PyCAR](https://github.com/ireapps/pycar/) repository. The [take-home portion](https://github.com/ireapps/pycar/blob/master/takehome/README.md) points to several tutorials and guides that cover Mac and PC setup, how to compartmentalize projects in their own virtual environments and the use of version control. 4 | 5 | If you want to use a virtualenv, we've included a (short) requirements file for ```pip``` with all the non-standard libraries used and their dependencies. 6 | 7 | Here are stops to make as you expand your knowledge and keep practicing these news skills: 8 | 9 | - Again, the [PyCAR](https://github.com/ireapps/pycar/) repository, a daylong class taught at the 2014 and 2015 NICAR Conferences. 10 | - Anthony Debarros' [python-get-started](https://github.com/anthonydb/python-get-started) repository and his useful [python-snippets](https://github.com/anthonydb/python-snippets), a list of practical code examples. 
11 | - Ben Welsh's updated guide to building [a web scraper in Python](https://github.com/ireapps/first-web-scraper) and his guide for building [a lightweight news app](https://github.com/ireapps/first-news-app). 12 | - [LearnPython](http://learnpython.org) is an interactive tutorial that covers a lot of this same ground and more. 13 | 14 | ### Version control 15 | 16 | As you start feeling more comfortable with the basics, you may want to start using a version control setup like [git](https://git-scm.com/) to catalog your scripts, collaborate with others and share your code with the world. 17 | 18 | Here are a couple of guides from NICAR presenters and other sources that may come in handy: 19 | 20 | - Tom Meagher's [tipsheet on git and GitHub](http://ire.org/resource-center/tipsheets/3863/) 21 | - [Resources and code shared during NICAR15](https://github.com/githubteacher/nicar-2015) from the fine folks at GitHub 22 | - Lauren Orsini's [guide for beginners](http://readwrite.com/2013/09/30/understanding-github-a-journey-for-beginners-part-1#awesm=~oAh764aNbWfqyH) 23 | - GitHub's own [command cheat sheet](https://training.github.com/kit/downloads/github-git-cheat-sheet.pdf) -------------------------------------------------------------------------------- /1_intro/whitespace.py: -------------------------------------------------------------------------------- 1 | # FUN WITH WHITESPACE IN PYTHON 2 | 3 | 4 | # Whitespace is critical in Python. Unlike some other scripting languages, 5 | # which use characters to tell the interpreter where functions and loops 6 | # end, Python uses structured indentation for new lines, making "blocks" of 7 | # code. 8 | 9 | my_string = 'New York' 10 | 11 | print "Start spreading the news," 12 | 13 | if my_string == 'New York': 14 | print "I'm leaving today," 15 | print "I want to be a part of it," 16 | 17 | for num in range(0,2): 18 | print my_string 19 | 20 | else: 21 | print "you clearly don't know how this song goes. {}?".format(my_string) 22 | 23 | # What do you think the above does? Let's step through it. 24 | # (Notice how blank lines between code is A-OK.) 25 | 26 | 27 | # Some other places indentation and whitespace don't matter much: 28 | 29 | # When assigning items to a list or a string; the below is ugly, but sometimes 30 | # it's more readable in a script to define things on different lines. 31 | 32 | list_of_cities = [ 33 | 34 | 'Buffalo', 35 | 36 | 'Key West', 37 | 'Fort Collins', 'Bakersfield' ] 38 | 39 | wordy_string = "Four score and seven years ago, our fathers brought" \ 40 | " forth on this continent ... hmm. I" \ 41 | " am desperately trying to remember what Abraham Lincoln" \ 42 | " said, because it was one of the most important and" \ 43 | " and influentual speeches in modern history; I've even" \ 44 | " been to Gettysburg. Wow, this is pretty embarrasing." 45 | 46 | 47 | # Tabs and spaces. Don't mix them. The interpreter will choke on it. Style 48 | # dictates that you use four spaces instead of tabs. I generally set up my 49 | # text editor to replace tabs on the fly or do it after I'm done with my 50 | # script, because I much prefer hitting tab once instead of space four times. 51 | 52 | print "Start spreading the news," 53 | if my_string == 'New York': 54 | print "I'm leaving today," 55 | print "I want to be a part of it," 56 | for num in range(0,2): 57 | print my_string 58 | else: 59 | print "you clearly don't know how this song goes. {}?".format(my_string) 60 | 61 | # The above looks fine, right? You will get an IndentationError. 
Most text 62 | # editors have a function 63 | -------------------------------------------------------------------------------- /site/css/highlight.css: -------------------------------------------------------------------------------- 1 | /* 2 | This is the GitHub theme for highlight.js 3 | 4 | github.com style (c) Vasily Polovnyov 5 | 6 | */ 7 | 8 | .hljs { 9 | display: block; 10 | overflow-x: auto; 11 | padding: 0.5em; 12 | color: #333; 13 | -webkit-text-size-adjust: none; 14 | } 15 | 16 | .hljs-comment, 17 | .diff .hljs-header, 18 | .hljs-javadoc { 19 | color: #998; 20 | font-style: italic; 21 | } 22 | 23 | .hljs-keyword, 24 | .css .rule .hljs-keyword, 25 | .hljs-winutils, 26 | .nginx .hljs-title, 27 | .hljs-subst, 28 | .hljs-request, 29 | .hljs-status { 30 | color: #333; 31 | font-weight: bold; 32 | } 33 | 34 | .hljs-number, 35 | .hljs-hexcolor, 36 | .ruby .hljs-constant { 37 | color: #008080; 38 | } 39 | 40 | .hljs-string, 41 | .hljs-tag .hljs-value, 42 | .hljs-phpdoc, 43 | .hljs-dartdoc, 44 | .tex .hljs-formula { 45 | color: #d14; 46 | } 47 | 48 | .hljs-title, 49 | .hljs-id, 50 | .scss .hljs-preprocessor { 51 | color: #900; 52 | font-weight: bold; 53 | } 54 | 55 | .hljs-list .hljs-keyword, 56 | .hljs-subst { 57 | font-weight: normal; 58 | } 59 | 60 | .hljs-class .hljs-title, 61 | .hljs-type, 62 | .vhdl .hljs-literal, 63 | .tex .hljs-command { 64 | color: #458; 65 | font-weight: bold; 66 | } 67 | 68 | .hljs-tag, 69 | .hljs-tag .hljs-title, 70 | .hljs-rule .hljs-property, 71 | .django .hljs-tag .hljs-keyword { 72 | color: #000080; 73 | font-weight: normal; 74 | } 75 | 76 | .hljs-attribute, 77 | .hljs-variable, 78 | .lisp .hljs-body, 79 | .hljs-name { 80 | color: #008080; 81 | } 82 | 83 | .hljs-regexp { 84 | color: #009926; 85 | } 86 | 87 | .hljs-symbol, 88 | .ruby .hljs-symbol .hljs-string, 89 | .lisp .hljs-keyword, 90 | .clojure .hljs-keyword, 91 | .scheme .hljs-keyword, 92 | .tex .hljs-special, 93 | .hljs-prompt { 94 | color: #990073; 95 | } 96 | 97 | .hljs-built_in { 98 | color: #0086b3; 99 | } 100 | 101 | .hljs-preprocessor, 102 | .hljs-pragma, 103 | .hljs-pi, 104 | .hljs-doctype, 105 | .hljs-shebang, 106 | .hljs-cdata { 107 | color: #999; 108 | font-weight: bold; 109 | } 110 | 111 | .hljs-deletion { 112 | background: #fdd; 113 | } 114 | 115 | .hljs-addition { 116 | background: #dfd; 117 | } 118 | 119 | .diff .hljs-change { 120 | background: #0086b3; 121 | } 122 | 123 | .hljs-chunk { 124 | color: #aaa; 125 | } 126 | -------------------------------------------------------------------------------- /site/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | None/ 7 | 2015-11-10 8 | daily 9 | 10 | 11 | 12 | 13 | 14 | None/install/ 15 | 2015-11-10 16 | daily 17 | 18 | 19 | 20 | 21 | 22 | None/pt1/ 23 | 2015-11-10 24 | daily 25 | 26 | 27 | 28 | 29 | 30 | None/pt2/ 31 | 2015-11-10 32 | daily 33 | 34 | 35 | 36 | 37 | 38 | None/pt3/ 39 | 2015-11-10 40 | daily 41 | 42 | 43 | 44 | 45 | 46 | None/pt4/ 47 | 2015-11-10 48 | daily 49 | 50 | 51 | 52 | 53 | 54 | None/pt5/ 55 | 2015-11-10 56 | daily 57 | 58 | 59 | 60 | 61 | 62 | None/pt6/ 63 | 2015-11-10 64 | daily 65 | 66 | 67 | 68 | 69 | 70 | None/pt7/ 71 | 2015-11-10 72 | daily 73 | 74 | 75 | 76 | 77 | 78 | None/pt8/ 79 | 2015-11-10 80 | daily 81 | 82 | 83 | 84 | 85 | 86 | None/pt9/ 87 | 2015-11-10 88 | daily 89 | 90 | 91 | 92 | 93 | 94 | None/pt10/ 95 | 2015-11-10 96 | daily 97 | 98 | 99 | 100 | 101 | 102 | None/next_steps/ 103 | 2015-11-10 104 | daily 105 | 106 | 107 | 108 | 
-------------------------------------------------------------------------------- /2_web_scrape/scrape_pt2.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We scraped our table, but there's actually information on the detail page 2 | # we want to have in our result. 3 | # 4 | # HOW WE'RE GOING TO DEAL WITH IT: 5 | # - Do everything we did before: fetch a page, navigate it and output to csv 6 | # - Refine our script to dip into the detail page for each reactor 7 | # - Use pattern matching to isolate two additional data points for our csv 8 | # - 9 | 10 | # Let's add Python's regex and time libraries. 11 | import requests 12 | from bs4 import BeautifulSoup 13 | import csv 14 | 15 | 16 | 17 | url = 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html' 18 | 19 | web_page = requests.get(url) 20 | web_page.encoding = 'UTF-8' 21 | soup = BeautifulSoup(web_page.text, 'html.parser') 22 | 23 | reactor_table = soup.find('table') 24 | 25 | # We'll send stuff to a different file. 26 | csv_file = open('reactors.csv', 'wb') 27 | output = csv.writer(csv_file) 28 | # Add the two new fields we'll grab from the detail page to the header row. 29 | output.writerow(['NAME', 'LINK', 'DOCKET', 'TYPE', 'LOCATION', 'OWNER', 'REGION']) 30 | 31 | for row in reactor_table.find_all('tr')[1:]: 32 | cell = row.find_all('td') 33 | name = cell[0].contents[0].string 34 | link = cell[0].contents[0].get('href') 35 | docket = cell[0].contents[2].strip() 36 | type = cell[1].string 37 | location = cell[2].string.encode('utf8') 38 | owner = cell[3].contents[0].encode('utf8') 39 | region = cell[4].string 40 | 41 | # Let's get the reactor's detail page and get a quick note about it. 42 | 43 | 44 | 45 | # Use pattern matching to find the text after 'Licensed MwT:' 46 | # '(?i)licensed mwt:\s*(.*)<' decoded: 47 | # (?i) = ignore case 48 | # \s* = spaces may exist 49 | # () = the text we care about 50 | # .* = all characters, including text, numbers and punctuation 51 | 52 | # Send the match to a variable. 53 | 54 | 55 | # Do the same thing for the text after 'Containment Type:' 56 | # '(?i)containment type:\s*(.*?)\s*<' 57 | 58 | 59 | 60 | # Add these two new fields to the csv.writer's output. 61 | output.writerow([name, link, docket, type, location, owner, region]) 62 | 63 | # Let's slow down how quickly we're sending these requests for detail pages. 64 | time.sleep(2) 65 | 66 | 67 | csv_file.close() 68 | print 'All done!' 69 | -------------------------------------------------------------------------------- /8_cleaning/names.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | # Quick check to make sure we're getting all of our links before we move on 18 | 19 | 20 | 21 | 22 | # Let's look at the URL string 23 | 24 | 25 | 26 | # What we really care about are the numbers at the end of the link; that's 27 | # how the whole setup for the website works. We feed it an ID, it generates 28 | # a page. 29 | 30 | 31 | 32 | 33 | 34 | 35 | # We only need the number at the end; we can split the URL and grab it 36 | 37 | 38 | 39 | 40 | 41 | 42 | # Just as a test, let's look at the first one to see what we're dealing with. 43 | 44 | 45 | 46 | 47 | # Two tables on each page, both are functionally identical to BeautifulSoup, 48 | # so we just grab both and take the second. 
49 | 50 | 51 | 52 | 53 | # The names are coming from a db that's dynamically combining them; we can use that. 54 | # All the components are inside of label tags. If we try to extract the text, we're 55 | # met with a mess of unicode junk (non-breaking spaces, in this case) meant to 56 | # glue the title, first name, middle initial and last name together. 57 | 58 | 59 | 60 | # Pull the name and clean it up with split(), which by default will work on 61 | # whitespace. .join and .split together can be handy for cleaning. 62 | 63 | 64 | 65 | 66 | # Depending on how we need the name to be parsed, it might be better for us 67 | # to chop it up into components. 68 | 69 | 70 | 71 | 72 | 73 | # Now that we see how this works, we can write a full run. We'll feed 74 | # requests.get a page, parse it with BeautifulSoup, and write it to CSV. 75 | 76 | # We can use the 'with' syntax here to open the file 77 | 78 | 79 | 80 | 81 | # We can basically use what we've done above to zero in on the table 82 | 83 | 84 | 85 | 86 | # Another for loop to move inside each row (except the header) 87 | 88 | 89 | 90 | 91 | 92 | 93 | # If it's a vacancy, let's not go through the trouble of splitting. 94 | 95 | 96 | 97 | # Slot the different components here 98 | 99 | 100 | 101 | 102 | 103 | 104 | # Last name is currently including suffixes like Jr, Sr, III, etc. 105 | # We can look for a comma that denotes the suffix. 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /2_web_scrape/fun_with_regex.py: -------------------------------------------------------------------------------- 1 | # Regular expressions are a powerful tool for pattern matching when you 2 | # know the general format of what you're trying to find but want to keep 3 | # it loose in terms of actual content: think finding email addresses or 4 | # phone numbers based on what they have in common with each other. Python 5 | # has a standard library that deals with it. 6 | 7 | import re 8 | 9 | # 10 | 11 | records = [ 12 | 'April 13, 2013 Cyberdyne Systems $4,000.00 18144 El Camino ' 13 | 'Real, Sunnyvale, CA 94087 (408) 555-1234 info@cyberdyne.com ' 14 | 'December 2, 2018 December 14, 2018', 15 | 16 | 'May 4, 2013 Sam Fuzz, Inc. $6,850.50 939 Walnut St, San ' 17 | 'Carlos, CA 94070 (408) 555-0304 ceo@samfuzz.net January 28' 18 | ', 2016 February 15, 2016'] 19 | 20 | # Find the word 'Sunnyvale' in the first record with re.search() 21 | 22 | 23 | # Find the first date in the first record. Let's pick apart the pattern: 24 | # 1. \w matches upper/lowercase A-Z and digits 0-9, good for text. 25 | # 2. {3,} matches three or more (shortest possible month is May) 26 | # 3. \s matches whitespace, good for spaces and tabs 27 | # 4. {1} matches exactly one 28 | # 5. \d matches 0-9 29 | # 6. {1,2} matches at least one, but no more than 2 30 | # 7. , matches the comma in the date 31 | # 8. \s{1}: again, one space or tab 32 | # 9. \d{4} matches four digits. 33 | 34 | 35 | # Do the same thing but wrap some parentheses around the month, day and year 36 | # patterns and re.search().group(0) to return the whole date. 37 | 38 | 39 | 40 | # Try 1, 2 and 3 to cycle through month, day and year. 41 | 42 | 43 | 44 | 45 | # Grab all the dates in the first record with re.findall(). 46 | 47 | 48 | # Print them out with a for loop 49 | 50 | 51 | 52 | # Pick out and print dollar amounts from the records. 53 | # . 
matches any character, * matches any number of times 54 | 55 | 56 | 57 | 58 | # Try to do the same thing for the phone numbers. 59 | 60 | 61 | 62 | 63 | # How would I isolate something like a company name that's totally variable? 64 | # Think about the hooks you have on either side; the pattern you want to 65 | # match here has to do with what's around it. 66 | 67 | 68 | 69 | 70 | # We can also substitute based on a pattern. Give everyone an '.info' 71 | # email address via print and re.sub(). 72 | 73 | 74 | 75 | # If you have multiple character possibilities that act as delimiters for a 76 | # string you want to break apart, re.split() can come in handy. 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /docs/pt7.md: -------------------------------------------------------------------------------- 1 | #### Unlocking data from databases 2 | 3 | Most of the internet demands data in forms that play nicely with it — especially for visualization. JavaScript libraries that dynamically assemble charts, graphs and maps typically want something that looks a lot more like the JSON we got from the crime API over a CSV file or a database table. And that's OK, because it just gives Python another chance to shine. 4 | 5 | We're going to write a few scripts that convert our crime data into [GeoJSON](http://geojson.org/), which is designed specifically to store geographic data like points, lines and polygons, as well as data that may be associated with those shapes. 6 | 7 | From there, we're going to layer them on a very basic [Leaflet.js](http://leafletjs.com/) map that was hacked together from a few of their [tutorials](http://leafletjs.com/examples.html). 8 | 9 | Instead of having to load data into a GIS program like Esri's ArcGIS or the open-source QGIS and spend the time manually joining a table from a database to a shapefile and then exporting the whole affair to a web-friendly format, we'll handle it two ways. 10 | 11 | First, we'll write a section of script so that Python will generate GeoJSON straight from a database table. We'll use a new data type, [```OrderedDict```](https://docs.python.org/2/library/collections.html#collections.OrderedDict), to store key/value pairs as well as the order in which they are added. 12 | 13 | The next section of the Python script will process an existing GeoJSON polygon shapefile. It will check the ID of each polygon against an existing data set; if it finds a match, it will write new data into the shape. 14 | 15 | The whole point here is that you can have Python acting as an autonomous data depot, pulling down new data from one side and sending it to update dynamic visualizations from the other. 16 | 17 | We'll use the following files for this project: 18 | 19 | - **to_json.py**: The script we'll write to make and alter GeoJSON files based on information in our existing crime.db. 20 | 21 | - **chicago_crime.html**: An HTML file that contains JavaScript. It uses Leaflet.js to draw a map and other functions that are part of that library to incorporate GeoJSON files as map layers. 22 | 23 | - **comm_areas.geojson**: A GeoJSON file that contains all of Chicago's community areas. This originally existed as an Esri shapefile but was converted to this format in QGIS; there are also command line tools available that can do this, like [ogr2ogr](http://www.gdal.org/ogr2ogr.html). 24 | 25 | A finished version of the Python script is in **completed**. 
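Before digging into the workshop files, here's a minimal sketch of the first idea -- building GeoJSON point features from query rows with ```OrderedDict``` -- using made-up rows and a hypothetical variable name (`homicides`) rather than the actual crime.db query, so treat it as an illustration of the format, not the finished script:

```python
import json
from collections import OrderedDict

# Stand-in rows; the real script would pull (lat, lng, block, crime type)
# tuples out of crime.db with a SELECT query.
rows = [
    (41.7508, -87.6252, '100XX S STATE ST', 'HOMICIDE'),
]

features = []
for lat, lng, block, crime_type in rows:
    feature = OrderedDict()
    feature['type'] = 'Feature'
    # GeoJSON wants coordinates as [x, y], i.e. [longitude, latitude].
    feature['geometry'] = OrderedDict(
        [('type', 'Point'), ('coordinates', [lng, lat])])
    feature['properties'] = OrderedDict(
        [('block', block), ('primary_type', crime_type)])
    features.append(feature)

collection = OrderedDict(
    [('type', 'FeatureCollection'), ('features', features)])

# Prefixing the output with "var homicides = " lets the HTML page load the
# file with a plain script tag and hand it straight to Leaflet.
with open('homicides.geojson', 'w') as outfile:
    outfile.write('var homicides = ' + json.dumps(collection, indent=2))
```

The second step described above -- stamping a violent crime rate onto the community-area polygons -- is the same round trip in reverse: ```json.load()``` the existing GeoJSON, loop over its ```features``` list, add a new key to each feature's ```properties``` when the community ID matches, and write the result back out with ```json.dumps()```.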
-------------------------------------------------------------------------------- /docs/pt6.md: -------------------------------------------------------------------------------- 1 | #### APIs and databases 2 | 3 | Scraping isn't the only method for retrieving data; sometimes data resources on the internet — even one maintained by government agencies — actually want to give it away in a usable format. Web application rogramming interfaces exist to make this happen. More often than not, you pass a query through an API's URL and get a response in [JSON](http://www.json.org/) (JavaScript Object Notation) format, which is highly structured and can look a lot like a Python list or dictionary. 4 | 5 | We're going to query a [Socrata](http://www.socrata.com/) API maintained by the city of Chicago. This [particular data source](https://dev.socrata.com/foundry/#/data.cityofchicago.org/ijzp-q8t2/no-redirect) is where crime incidents are reported by the Chicago Police Department; the lag time for reporting is about one week. 6 | 7 | Our initial goal is to grab the most recent week of crimes in the system, which means pinging the API to figure out the date of the last crime and then grabbing every reported crime on that day and the previous six. This will involve some work with ```datetime``` and date math in Python. 8 | 9 | We're also going to inspect the incoming stream of JSON from the API to make sure we can tell if any are missing fields. This becomes important because we are going to feed each crime event into an [SQLite](https://docs.python.org/2/library/sqlite3.html) database. 10 | 11 | SQLite may not be as robust as some other database managers, but it's part of the standard library in later versions of Python 2.7 and creates lightweight database files that are easy to connect to and query. 12 | 13 | We'll write some automated queries for crime trends, including which beats in Chicago had the most reported narcotics crimes during the week and what the violent crime rate was in different communities. 14 | 15 | This particular task uses these files: 16 | 17 | - **API.py**: The main script that fetches data from the crime API, puts it in a SQLite database and runs queries on the data. 18 | 19 | - **crime.db**: A SQLite database that only has a table with population data for Chicago's various community areas. We'll add to it from the crime API. 20 | 21 | - **fun_with_datetime.py**: A more focused look at datetime objects and how they work in Python. 22 | 23 | - **fun_with_sqlite.py**: The fundamentals of connecting to, modifying and querying a database with Python. 24 | 25 | - **crime_backup.json.zip**: Shoddy internet or a downed website won't ruin this exercise; this is compressed JSON that represents a week of Chicago crime. 26 | 27 | Finished versions are in the **completed** folder. -------------------------------------------------------------------------------- /2_web_scrape/completed/scrape_done.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We want to scrape the contents of a table into a delimited file. 
2 | # 3 | # HOW WE'RE GOING TO DEAL WITH IT: 4 | # - Use the requests library to grab the page source 5 | # - Use BeautifulSoup to navigate to the table and then loop through its rows 6 | # - Write it all to a csv file 7 | # - Handle some encoding issues 8 | 9 | # Let's import the libraries we'll be using for this parsing task: BeautifulSoup from bs4, 10 | # Python's csv library and requests 11 | import requests 12 | from bs4 import BeautifulSoup 13 | import csv 14 | 15 | # The data table we want is at 16 | # 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html' 17 | url = 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html' 18 | 19 | # Fetch the page with requests, set the page encoding and turn it into a BeautifulSoup 20 | # object. 21 | web_page = requests.get(url) 22 | web_page.encoding = 'UTF-8' 23 | soup = BeautifulSoup(web_page.text, 'html.parser') 24 | 25 | # This page only has one table, so it's not much work to find it. 26 | reactor_table = soup.find('table') 27 | 28 | # Let's make a new, empty csv file and a csv.writer which will take write our 29 | # table one row at a time to the file. 30 | csv_file = open('reactors.csv', 'wb') 31 | output = csv.writer(csv_file) 32 | # The first row written will be our field names. 33 | output.writerow(['NAME', 'LINK', 'DOCKET', 'TYPE', 'LOCATION', 'OWNER', 'REGION']) 34 | 35 | # We want to loop through all
(cells) below it; these are what we'll move into variables 38 | # and then write to the csv. 39 | cell = row.find_all('td') 40 | # Reactor name, detail page link and docket number are all part of the first cell. 41 | # Docket has a bunch of whitespace, so we'll .strip() it. 42 | name = cell[0].contents[0].string 43 | link = cell[0].contents[0].get('href') 44 | docket = cell[0].contents[2].strip() 45 | type = cell[1].string 46 | # Two fields in this table have characters like en dash; we need to make sure these are 47 | # encoded properly when writing to the csv or it will break our script. 48 | location = cell[2].string.encode('utf8') 49 | owner = cell[3].contents[0].encode('utf8') 50 | region = cell[4].string 51 | 52 | # Once everything's collected, write it as a row in the csv. 53 | output.writerow([name, link, docket, type, location, owner, region]) 54 | 55 | # Close the file and let us know it's finished. 56 | csv_file.close() 57 | print 'All done!' 58 | -------------------------------------------------------------------------------- /docs/pt2.md: -------------------------------------------------------------------------------- 1 | ### Scraping the web 2 | 3 | Now that we've familiarized ourselves with the ways Python works, we have a little bit of a foundation to build from. Nearly everything else we do today is going to be using the fundamentals from [the intro](pt1.md) to varying degrees and in different combinations to create longer scripts. 4 | 5 | So let's scrape a web page. We want to collect all the data from the main table on the U.S. Nuclear Regulatory Commission's [list of domestic power reactor units](http://www.nrc.gov/reactors/operating/list-power-reactor-units.html). 6 | 7 | Python comes with a library installed that's designed specifically for reading and writing CSV files ([```csv```](https://docs.python.org/2/library/csv.html)), but we're also going to need to extend Python's functionality a bit by bringing in two other libraries. 8 | 9 | One is [```requests```](http://docs.python-requests.org/en/latest/) -- it handles the job of playing a web browser that can fetch a web page and send back the underlying HTML. The other is [```BeautifulSoup```](http://www.crummy.com/software/BeautifulSoup/), which parses the HTML into what amounts to a series of lists that we can then search, navigate and extract data from. 10 | 11 | When we get to part two, we'll use the built-in regular expressions library [```re```](https://docs.python.org/2/library/re.html) to isolate some text from the detail pages and [```time```](https://docs.python.org/2/library/time.html) to keep us from swamping a government site with too many requests at once. 12 | 13 | A big thank you to [Anthony DeBarros](https://twitter.com/anthonydb) for allowing us to present a modified version of his web scraping example from [python-get-started](https://github.com/anthonydb/python-get-started). 14 | 15 | We'll use the following files: 16 | 17 | - **scrape.py**: The file we'll use to write our scraping script, following the comments. 18 | 19 | - **scrape_pt2.py**: The file we'll use to push our scraping script further; it contains finished code for **scrape.py** and open spots to add code that loops through to detail pages and collects additional information. 20 | 21 | - **nrc_backup.html**: A backup version of the main table we want to scrape in case there's a connection problem. 
22 | 23 | - **table_example.html**: A bare bones HTML table that shows the basic tags and how they're nested, with the flourishes of a modern web page stripped away -- it's ugly. 24 | 25 | - **fun_with_bs.py**: A primer for some of BeautifulSoup's most relevant commands for navigating HTML. 26 | 27 | - **fun_with_csv.py**: A brief example of how Python uses its standard csv library to read and write delimited-text files. 28 | 29 | - **fun_with_regex.py**: A file that covers some regular expresses in Python for finding and isolating text. 30 | 31 | Finished versions will appear in the **completed** folder. -------------------------------------------------------------------------------- /2_web_scrape/completed/scrape_pt2_done.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We scraped our table, but there's actually information on the detail page 2 | # we want to have in our result. 3 | # 4 | # HOW WE'RE GOING TO DEAL WITH IT: 5 | # - Do everything we did before: fetch a page, navigate it and output to csv 6 | # - Refine our script to dip into the detail page for each reactor 7 | # - Use pattern matching to isolate two additional data points for our csv 8 | # - 9 | 10 | # Let's add Python's regex and time libraries. 11 | import requests 12 | from bs4 import BeautifulSoup 13 | import csv 14 | import re 15 | import time 16 | 17 | url = 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html' 18 | 19 | web_page = requests.get(url) 20 | web_page.encoding = 'UTF-8' 21 | soup = BeautifulSoup(web_page.text, 'html.parser') 22 | 23 | reactor_table = soup.find('table') 24 | 25 | # We'll send stuff to a different file. 26 | csv_file = open('reactors_more.csv', 'wb') 27 | output = csv.writer(csv_file) 28 | # Add the two new fields we'll grab from the detail page to the header row. 29 | output.writerow(['NAME', 'LINK', 'DOCKET', 'TYPE', 'LOCATION', 'OWNER', 'REGION', 'MWT', 'CONTAINMENT']) 30 | 31 | for row in reactor_table.find_all('tr')[1:]: 32 | cell = row.find_all('td') 33 | name = cell[0].contents[0].string 34 | link = cell[0].contents[0].get('href') 35 | docket = cell[0].contents[2].strip() 36 | type = cell[1].string 37 | location = cell[2].string.encode('utf8') 38 | owner = cell[3].contents[0].encode('utf8') 39 | region = cell[4].string 40 | 41 | # Let's get the reactor's detail page and get a quick note about it. 42 | print 'Fetching detail from '+link 43 | detail_page = requests.get('http://www.nrc.gov'+link) 44 | 45 | # Use pattern matching to find the text after 'Licensed MwT:' 46 | # '(?i)licensed mwt:\s*(.*)<' decoded: 47 | # (?i) = ignore case 48 | # \s* = spaces may exist 49 | # () = the text we care about 50 | # .* = all characters, including text, numbers and punctuation 51 | mwt_search = re.search('(?i)licensed mwt:\s*(.*)<', detail_page.text) 52 | # Send the match to a variable. 53 | mwt = mwt_search.group(1).strip() 54 | 55 | # Do the same thing for the text after 'Containment Type:' 56 | # '(?i)containment type:\s*(.*?)\s*<' 57 | containment_search = re.search('(?i)containment type:\s*(.*?)\s*<', detail_page.text) 58 | containment = containment_search.group(1).strip() 59 | 60 | # Add these two new fields to the csv.writer's output. 61 | output.writerow([name, link, docket, type, location, owner, region, mwt, containment]) 62 | 63 | # Let's slow down how quickly we're sending these requests for detail pages. 64 | time.sleep(2) 65 | 66 | 67 | csv_file.close() 68 | print 'All done!' 
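# One fragile spot worth flagging in the loop above: re.search() returns None
# when a pattern doesn't match, so calling .group() on the result will raise an
# AttributeError and stop the scrape partway through. A small helper along these
# lines (hypothetical -- not part of the workshop script) is one way to fall
# back to a blank value instead of crashing:
#
# def find_or_blank(pattern, text):
#     match = re.search(pattern, text)
#     if match:
#         return match.group(1).strip()
#     return ''
#
# mwt = find_or_blank('(?i)licensed mwt:\s*(.*)<', detail_page.text)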
69 | -------------------------------------------------------------------------------- /5_geocode/completed/geocode_done.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We need coordinates for the addresses we parsed earlier. 2 | # 3 | # HOW WE'RE GOING TO DEAL WITH IT: 4 | # - Use a library called geopy to run an address through Google's geocoder 5 | # - Use a sleep function to pause so we don't swamp Google 6 | # - Keep track of it all with a keyed dictionary (coming in and going out) 7 | # - Put the original information AND our returned lat/long coordinates into a 8 | # new csv file 9 | 10 | # Import the Google geocoder from geopy as well as Python's csv and time libaries 11 | from geopy.geocoders import GoogleV3 12 | import csv 13 | import time 14 | 15 | # Make a geolocator object 16 | geolocator = GoogleV3() 17 | 18 | # Open our address file and start a DictReader object that will give each element in 19 | # each row a key/value pair based on the header columns in the file. 20 | address_file = open('payday_lenders.csv', 'rb') 21 | input = csv.DictReader(address_file) 22 | 23 | # We'll go ahead and set up a new file for our eventual output. 24 | geocoded_file = open('payday_geocoded.csv', 'wb') 25 | # For the DictWriter, we have to give it a list of fields from the get-go to establish 26 | # the order; we'll go ahead and write a header to the file, too. 27 | output_fields = ['NAME', 'DBA', 'STADDR', 'STADDR2', 'CITY', 'STATE', 'ZIP', 'MATCH_ADDR', 'LAT_Y', 'LONG_X'] 28 | output = csv.DictWriter(geocoded_file, output_fields) 29 | output.writeheader() 30 | 31 | # Start for loop here 32 | for row in input: 33 | # We're going to put an if/else here to prevent the whole class from launching a 34 | # volley of 500 requests at Google. Let's get the first five (row 1 is the header). 35 | if input.line_num <= 6: 36 | # Put the address in a Google-recognizable string: ADDRESS, CITY, STATE ZIP 37 | addr = (row['STADDR']+' '+row['STADDR2']).strip()+', '+row['CITY']+', '+row['STATE']+' '+row['ZIP'] 38 | # Geocode that string 39 | location = geolocator.geocode(addr) 40 | # Plug results from the geocoder right back into the same row of data with new key 41 | # values: the returned latitude, longitude and address Google matched on. 42 | row['LAT_Y'] = location.latitude 43 | row['LONG_X'] = location.longitude 44 | row['MATCH_ADDR'] = location.address 45 | # Write the modified row to our new csv. 46 | output.writerow(row) 47 | # To keep tabs on what's happening, get a printed message with address and line. 48 | print 'Attempted geocode of '+addr+', row '+str(input.line_num) 49 | # Before we do all of this with the next row, pause for two seconds. 50 | time.sleep(2) 51 | else: 52 | break 53 | 54 | # Alert us with a printed message when this completes and close both files. 55 | print 'All done!' 56 | address_file.close() 57 | geocoded_file.close() 58 | -------------------------------------------------------------------------------- /7_out_of_dbs/chicago_crime.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | Chicago Violent Crime and Homicide Map 11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 | 23 | 24 | 25 | 26 | 27 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /site/mkdocs/js/search.js: -------------------------------------------------------------------------------- 1 | require([ 2 | base_url + '/mkdocs/js/mustache.min.js', 3 | base_url + '/mkdocs/js/lunr-0.5.7.min.js', 4 | 'text!search-results-template.mustache', 5 | 'text!../search_index.json', 6 | ], function (Mustache, lunr, results_template, data) { 7 | "use strict"; 8 | 9 | function getSearchTerm() 10 | { 11 | var sPageURL = window.location.search.substring(1); 12 | var sURLVariables = sPageURL.split('&'); 13 | for (var i = 0; i < sURLVariables.length; i++) 14 | { 15 | var sParameterName = sURLVariables[i].split('='); 16 | if (sParameterName[0] == 'q') 17 | { 18 | return decodeURIComponent(sParameterName[1].replace(/\+/g, '%20')); 19 | } 20 | } 21 | } 22 | 23 | var index = lunr(function () { 24 | this.field('title', {boost: 10}); 25 | this.field('text'); 26 | this.ref('location'); 27 | }); 28 | 29 | data = JSON.parse(data); 30 | var documents = {}; 31 | 32 | for (var i=0; i < data.docs.length; i++){ 33 | var doc = data.docs[i]; 34 | doc.location = base_url + doc.location; 35 | index.add(doc); 36 | documents[doc.location] = doc; 37 | } 38 | 39 | var search = function(){ 40 | 41 | var query = document.getElementById('mkdocs-search-query').value; 42 | var search_results = document.getElementById("mkdocs-search-results"); 43 | while (search_results.firstChild) { 44 | search_results.removeChild(search_results.firstChild); 45 | } 46 | 47 | if(query === ''){ 48 | return; 49 | } 50 | 51 | var results = index.search(query); 52 | 53 | if (results.length > 0){ 54 | for (var i=0; i < results.length; i++){ 55 | var result = results[i]; 56 | doc = documents[result.ref]; 57 | doc.base_url = base_url; 58 | doc.summary = doc.text.substring(0, 200); 59 | var html = Mustache.to_html(results_template, doc); 60 | search_results.insertAdjacentHTML('beforeend', html); 61 | } 62 | } else { 63 | search_results.insertAdjacentHTML('beforeend', "
No results found
"); 64 | } 65 | 66 | if(jQuery){ 67 | /* 68 | * We currently only automatically hide bootstrap models. This 69 | * requires jQuery to work. 70 | */ 71 | jQuery('#mkdocs_search_modal a').click(function(){ 72 | jQuery('#mkdocs_search_modal').modal('hide'); 73 | }) 74 | } 75 | 76 | }; 77 | 78 | var search_input = document.getElementById('mkdocs-search-query'); 79 | 80 | var term = getSearchTerm(); 81 | if (term){ 82 | search_input.value = term; 83 | search(); 84 | } 85 | 86 | search_input.addEventListener("keyup", search); 87 | 88 | }); 89 | -------------------------------------------------------------------------------- /site/css/theme_extra.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Tweak the overal size to better match RTD. 3 | */ 4 | body { 5 | font-size: 90%; 6 | } 7 | 8 | h3, h4, h5, h6 { 9 | color: #2980b9; 10 | font-weight: 300 11 | } 12 | 13 | /* 14 | * Sphinx doesn't have support for section dividers like we do in 15 | * MkDocs, this styles the section titles in the nav 16 | * 17 | * https://github.com/mkdocs/mkdocs/issues/175 18 | */ 19 | .wy-menu-vertical span { 20 | line-height: 18px; 21 | padding: 0.4045em 1.618em; 22 | display: block; 23 | position: relative; 24 | font-size: 90%; 25 | color: #838383; 26 | } 27 | 28 | .wy-menu-vertical .subnav a { 29 | padding: 0.4045em 2.427em; 30 | } 31 | 32 | /* 33 | * Long navigations run off the bottom of the screen as the nav 34 | * area doesn't scroll. 35 | * 36 | * https://github.com/mkdocs/mkdocs/pull/202 37 | */ 38 | .wy-nav-side { 39 | height: 100%; 40 | overflow-y: auto; 41 | } 42 | 43 | /* 44 | * readthedocs theme hides nav items when the window height is 45 | * too small to contain them. 46 | * 47 | * https://github.com/mkdocs/mkdocs/issues/#348 48 | */ 49 | .wy-menu-vertical ul { 50 | margin-bottom: 2em; 51 | } 52 | 53 | /* 54 | * Fix wrapping in the code highlighting 55 | * 56 | * https://github.com/mkdocs/mkdocs/issues/233 57 | */ 58 | code { 59 | white-space: pre; 60 | } 61 | 62 | /* 63 | * Wrap inline code samples otherwise they shoot of the side and 64 | * can't be read at all. 65 | * 66 | * https://github.com/mkdocs/mkdocs/issues/313 67 | */ 68 | p code { 69 | word-wrap: break-word; 70 | } 71 | 72 | /* 73 | * The CSS classes from highlight.js seem to clash with the 74 | * ReadTheDocs theme causing some code to be incorrectly made 75 | * bold and italic. 76 | * 77 | * https://github.com/mkdocs/mkdocs/issues/411 78 | */ 79 | code.cs, code.c { 80 | font-weight: inherit; 81 | font-style: inherit; 82 | } 83 | 84 | /* 85 | * Fix some issues with the theme and non-highlighted code 86 | * samples. Without and highlighting styles attached the 87 | * formatting is broken. 
88 | * 89 | * https://github.com/mkdocs/mkdocs/issues/319 90 | */ 91 | .no-highlight { 92 | display: block; 93 | padding: 0.5em; 94 | color: #333; 95 | } 96 | 97 | 98 | /* 99 | * Additions specific to the search functionality provided by MkDocs 100 | */ 101 | 102 | #mkdocs-search-results article h3 103 | { 104 | margin-top: 23px; 105 | border-top: 1px solid #E1E4E5; 106 | padding-top: 24px; 107 | } 108 | 109 | #mkdocs-search-results article:first-child h3 { 110 | border-top: none; 111 | } 112 | 113 | #mkdocs-search-query{ 114 | width: 100%; 115 | border-radius: 50px; 116 | padding: 6px 12px; 117 | border-color: #D1D4D5; 118 | } 119 | 120 | .wy-menu-vertical li ul { 121 | display: inherit; 122 | } 123 | 124 | .wy-menu-vertical li ul.subnav ul.subnav{ 125 | padding-left: 1em; 126 | } 127 | -------------------------------------------------------------------------------- /8_cleaning/completed/clean_csv_done.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import re 5 | 6 | orig_file_name = 'rahm_spending.csv' 7 | read_file = open(orig_file_name, 'rb') 8 | csv_reader = csv.reader(read_file) 9 | 10 | headers = csv_reader.next() 11 | 12 | # ['LASTONLYNAME', 'FIRSTNAME', 'EXPENDEDDATE', 'AMOUNT', 'ADDRESS1', 13 | # 'ADDRESS2', 'CITY', 'STATE', 'ZIP', 'EXPTYPE', 'PURPOSE', 'BENEFICIARY'] 14 | 15 | # Let's just go ahead and write a function for this. 16 | 17 | 18 | def cleaner(row): 19 | # Let's worry about the columns with problems. 20 | # LASTONLYNAME needs to be uppercase. 21 | lastonlyname = row[0].upper() 22 | # AMOUNT suffers from whitespace and dollar signs 23 | amount = float(row[3].replace('$', '').strip()) 24 | # CITY contains some problematic spellings of 'Chicago' 25 | # and non-breaking spaces for display (&NBSP;) 26 | if row[6] in ['CHGO', 'CHCAGO']: 27 | city = 'CHICAGO' 28 | else: 29 | city = row[6].replace('&NBSP;', ' ') 30 | # ZIP has leading zeros removed 31 | if len(row[8]) == 4: 32 | zip = '0{}'.format(row[8]) 33 | else: 34 | zip = row[8] 35 | # One thing with PURPOSE: there's additional detail after 36 | # a - or /. We can use regex to specify multiple split criteria. 37 | p_split = re.split('-|/', row[10]) 38 | # Have to set an if/else up for if it doesn't have one of these. 39 | if len(p_split) > 1: 40 | main_purpose = p_split[0].strip() 41 | purpose_extra = p_split[1].strip() 42 | else: 43 | main_purpose = row[10] 44 | purpose_extra = '' 45 | # There are also synonymous words present: 'fee,' 'fees,' 46 | # 'cost,' 'costs,' 'expense.' Replacing this with 'expenses' 47 | # will go a long way toward cleaner categories. 48 | problem_words = ['FEE', 'FEES', 'COST', 'COSTS', 'EXPENSE'] 49 | purpose_words = main_purpose.split() 50 | for word in purpose_words: 51 | if word in problem_words: 52 | loc = purpose_words.index(word) 53 | purpose_words.pop(loc) 54 | purpose_words.insert(loc, 'EXPENSES') 55 | main_purpose = ' '.join(purpose_words) 56 | # All done; let's return a revised row that contains our fixes and 57 | # rows we didn't touch. 58 | cleaned_row = [lastonlyname, row[1], row[2], amount, row[4], row[5], city, row[7], zip, row[9], main_purpose, purpose_extra, row[11]] 59 | return cleaned_row 60 | 61 | # Our file is loaded and ready to go. We have a cleaning function. Let's fix the 62 | # headers to match our file changes before we loose the function on the file. 
63 | 64 | headers.insert(headers.index('PURPOSE') + 1, 'DETAIL') 65 | 66 | clean_file_name = 'rahm_spending_clean.csv' 67 | with open(clean_file_name, 'wb') as write_file: 68 | csv_writer = csv.writer(write_file) 69 | csv_writer.writerow(headers) 70 | for row in csv_reader: 71 | # Here's where we can weed out non-expenditures from hitting our clean file. 72 | if row[9] == 'EXPENDITURE': 73 | csv_writer.writerow(cleaner(row)) 74 | 75 | print 'All done!' 76 | read_file.close() 77 | -------------------------------------------------------------------------------- /9_other_scrapes/completed/other_scrapes_done.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import re 6 | import time 7 | 8 | # We're not always going to be diving into a table or other information tied 9 | # up in HTML. Sometimes our target is a file or set of files; we can write the content 10 | # to outfiles in a directory of our choosing. 11 | 12 | # Fetch the page: 13 | # https://portal.chicagopolice.org/portal/page/portal/ClearPath/News/Crime Statistics 14 | 15 | url = 'https://portal.chicagopolice.org/portal/page/portal/ClearPath/News/Crime Statistics' 16 | page = requests.get(url) 17 | 18 | # Process the HTML with BeautifulSoup 19 | 20 | soup = BeautifulSoup(page.text) 21 | 22 | # Find all the links on the page (just for practice) 23 | soup.find_all('a') 24 | 25 | # The links we want don't have great characteristics to hook onto; let's just 26 | # find everything where the link contains '.pdf' 27 | 28 | pdf_links = soup.find_all(href=re.compile('.pdf')) 29 | 30 | # Sometimes it's useful to me to count up what I expect to have. We'll take 31 | # a peek here and use the enumerate function to count. When iterating through 32 | # a list it returns a tuple that looks like (, ) instead 33 | # of just . 34 | 35 | for link in enumerate(pdf_links): 36 | print '{0}: {1}'.format(link[0]+1, link[1].string) 37 | 38 | # Make an empty list to hold the URLs we're going to pull out of all these links. 39 | 40 | pdf_urls = [] 41 | 42 | # Loop through and grab the URLs. 43 | 44 | for link in pdf_links: 45 | pdf_urls.append(link['href']) 46 | 47 | # Now we have a list of PDF files to grab from the page. We're going to write 48 | # the content in each to a file, and we can use information we've gleaned 49 | # from the file or link to name the files that we're writing to disk. 50 | 51 | # Let's use a dict instead; the key will be the link text and the value will 52 | # be the URL. 53 | 54 | pdf_urls_dict = {} 55 | 56 | for link in pdf_links: 57 | pdf_urls_dict[link.string] = link['href'] 58 | 59 | # We can get the original file name out of the URL by splitting URL at '/' and 60 | # grabbing the last item in the list. 61 | 62 | name_ex = pdf_urls_dict['District 010'].split('/')[-1] 63 | print name_ex 64 | 65 | # Since the original file names don't contain any information about the District 66 | # or Area, let's add that in. 67 | 68 | # Also, let's make sure we're not bombarding the site with a ton of requests at 69 | # the same time; we need to briefly pause between downloads. 70 | 71 | # We'll pass the URL to a variable, get it with requests, do a little tinkering 72 | # with the PDF name and link name to remove spaces (and the URL equivalent), 73 | # then write them to a directory. 
74 | 75 | for url in pdf_urls_dict: 76 | pdf = pdf_urls_dict[url] 77 | file = requests.get(pdf) 78 | loc_name = url.replace(' ', '') 79 | pdf_name = pdf.split('/')[-1].replace('%20', '') 80 | file_name = '{0}-{1}'.format(loc_name, pdf_name) 81 | # Make a directory to hold these files 82 | directory = './pdfs/' 83 | with open(directory+file_name, 'wb') as outfile: 84 | outfile.write(file.content) 85 | print 'Grabbed '+loc_name 86 | time.sleep(1) 87 | -------------------------------------------------------------------------------- /2_web_scrape/completed/fun_with_regex_done.py: -------------------------------------------------------------------------------- 1 | # Regular expressions are a powerful tool for pattern matching when you 2 | # know the general format of what you're trying to find but want to keep 3 | # it loose in terms of actual content: think finding email addresses or 4 | # phone numbers based on what they have in common with each other. Python 5 | # has a standard library that deals with it. 6 | 7 | import re 8 | 9 | # 10 | 11 | records = [ 12 | 'April 13, 2013 Cyberdyne Systems $4,000.00 18144 El Camino ' 13 | 'Real, Sunnyvale, CA 94087 (408) 555-1234 info@cyberdyne.com ' 14 | 'December 2, 2018 December 14, 2018', 15 | 16 | 'May 4, 2013 Sam Fuzz, Inc. $6,850.50 939 Walnut St, San ' 17 | 'Carlos, CA 94070 (408) 555-0304 ceo@samfuzz.net January 28' 18 | ', 2016 February 15, 2016'] 19 | 20 | # Find the word 'Sunnyvale' in the first record with re.search() 21 | re.search('Sunnyvale', records[0]).group() 22 | 23 | # Find the first date in the first record. Let's pick apart the pattern: 24 | # 1. \w matches upper/lowercase A-Z and digits 0-9, good for text. 25 | # 2. {3,} matches three or more (shortest possible month is May) 26 | # 3. \s matches whitespace, good for spaces and tabs 27 | # 4. {1} matches exactly one 28 | # 5. \d matches 0-9 29 | # 6. {1,2} matches at least one, but no more than 2 30 | # 7. , matches the comma in the date 31 | # 8. \s{1}: again, one space or tab 32 | # 9. \d{4} matches four digits. 33 | re.search('\w{3,}\s{1}\d{1,2},\s{1}\d{4}', records[0]).group() 34 | 35 | # Do the same thing but wrap some parentheses around the month, day and year 36 | # patterns and re.search().group(0) to return the whole date. 37 | date_match = re.search('(\w{3,})\s{1}(\d{1,2}),\s{1}(\d{4})', records[0]) 38 | date_match.group(0) 39 | 40 | # Try 1, 2 and 3 to cycle through month, day and year. 41 | date_match.group(1) 42 | date_match.group(2) 43 | date_match.group(3) 44 | 45 | # Grab all the dates in the first record with re.findall(). 46 | all_dates = re.findall('\w{3,}\s{1}\d{1,2},\s{1}\d{4}', records[0]) 47 | 48 | # Print them out with a for loop 49 | for date in all_dates: 50 | print date 51 | 52 | # Pick out and print dollar amounts from the records. 53 | # . matches any character, * matches any number of times 54 | for record in records: 55 | money_match = re.search('\$.*\.\d{2}', record) 56 | print money_match.group() 57 | 58 | # Try to do the same thing for the phone numbers. 59 | for record in records: 60 | ph_match = re.search('\(\d{3}\)\s\d{3}-\d{4}', record) 61 | print ph_match.group() 62 | 63 | # How would I isolate something like a company name that's totally variable? 64 | # Think about the hooks you have on either side; the pattern you want to 65 | # match here has to do with what's around it. 
66 | for record in records: 67 | company_match = re.search('\d{4}\s(.+)\s\$', record) 68 | print company_match.group(1) 69 | 70 | # We can also substitute based on a pattern. Give everyone an '.info' 71 | # email address via print and re.sub(). 72 | for record in records: 73 | print re.sub('\.\w{3}', '.info', record) 74 | 75 | # If you have multiple character possibilities that act as delimiters for a 76 | # string you want to break apart, re.split() can come in handy. 77 | my_list = ['OCT-2010', 'NOV/2011', 'FEB 2012', 'MAR/2012'] 78 | for item in my_list: 79 | print re.split('-|/|\s', item) 80 | -------------------------------------------------------------------------------- /8_cleaning/completed/excel_done.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | from openpyxl import load_workbook 4 | import shutil 5 | 6 | # We're going to be messing with an existing file, so let's clone our Excel 7 | # file just to be on the safe side. 8 | 9 | shutil.copyfile('Candidates.xlsx', 'Candidates_WORKING.xlsx') 10 | 11 | # Load our Excel workbook into memory 12 | loc = 'Candidates_WORKING.xlsx' 13 | wb = load_workbook(filename=loc) 14 | 15 | # List the sheets we find in the workbook 16 | for sheet in wb.sheetnames: 17 | print sheet 18 | 19 | # Active sheet will default to the first; we can also select it 20 | sheet_read = wb.get_sheet_by_name('Sheet1') 21 | 22 | # We can iterate through rows an do things based on the pattern that 23 | # we find. First, though, we need to decide where to put the data. 24 | # Why not another blank sheet in the same book? 25 | 26 | sheet_write = wb.get_sheet_by_name('Sheet2') 27 | 28 | # Right now there's nothing on the sheet; the max row and column would be '1' 29 | sheet_write.max_row 30 | sheet_write.max_column 31 | 32 | # Max_row can help us keep track of where we are as we write to the book. 33 | # Let's start by writing a header for all the information we're going to 34 | # collect. 35 | column_names = ['Race', 'Candidate Name', 'Filing Date', 'Street Address', 'City', 'State', 'Zip', 'Phone Number', 'Email Address'] 36 | 37 | for header in column_names: 38 | sheet_write.cell(column=column_names.index(header) + 1, row=1, value=header) 39 | 40 | # This is our first modification to the workbook; let's save our changes. 41 | wb.save(filename=loc) 42 | 43 | # We need something outside the loop to hold onto the race name 44 | 45 | race_holder = '' 46 | 47 | # Let's walk through each row in our sheet with .rows 48 | for ws_row in sheet_read.rows: 49 | # Headers are duplicated for each race in this sheet; we can grab the 50 | # race name in the cell directly above and hold onto it IF we encounter 51 | # that first header item, "Candidate's Name." 52 | if ws_row[0].value == "Candidate's Name": 53 | cell_above = ws_row[0].column + str(ws_row[0].row - 1) 54 | race_holder = sheet_read[cell_above].value 55 | # The other condition to hunt for: Candidate info is spread out over two 56 | # rows. If there's something in the first and second columns, that means 57 | # it's a candidate, and we can go to work on that row and the row below it. 
58 | elif ws_row[0].value is not None and ws_row[1].value is not None: 59 | row_below = ws_row[0].row + 1 60 | cand_name = ws_row[0].value 61 | st_addr = ws_row[1].value 62 | phone = ws_row[3].value 63 | filed_date = ws_row[4].value 64 | city_st_zip = sheet_read['B'+str(row_below)].value 65 | # Let's break apart the combined city, state and zip 66 | city = city_st_zip.split(',')[0] 67 | st_zip = city_st_zip.split(',')[1].split() 68 | st = st_zip[0] 69 | zip = st_zip[1] 70 | email = sheet_read['D'+str(row_below)].value 71 | # Let's write the row to Sheet2, which we've already queued up. Watch 72 | # out for the date, which we'll reformat from a Python date object. 73 | row = [race_holder, cand_name, filed_date.strftime('%m/%d/%Y'), st_addr, city, st, zip, phone, email] 74 | write_row = sheet_write.max_row + 1 75 | for item in row: 76 | sheet_write.cell(column=row.index(item) + 1, row=write_row, value=item) 77 | 78 | # Save changes to the file. 79 | wb.save(filename=loc) 80 | -------------------------------------------------------------------------------- /7_out_of_dbs/to_json.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | # Now that we've processed JSON, moved into into a database an set up some helpful 4 | # prewritten queries for new records, we're going to talk about the flip side of that: 5 | # exporting data to JSON for use elsewhere. 6 | 7 | # PROBLEM: We have some cool crime data and have set up the automated analysis 8 | # of weekly records for interesting trends we may want to keep regular tabs 9 | # on. It all comes with coordinates, though; why don't we use that to our 10 | # advantage and generate some stuff we can actually see? 11 | 12 | # HOW WE'RE GOING TO DEAL WITH IT: 13 | # - Query our database, crawl through the rows of results to make a GeoJSON 14 | # file of points from scratch for display when connected to a Leaflet map. 15 | # - Query our database again while going through an existing GeoJSON and 16 | # adding data to it, that way we can display the result. 17 | 18 | 19 | 20 | 21 | 22 | # Connect to the crime.db we made earlier and grab all the homicide records. 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | # The GeoJSON format for points we'll need to write. Don't worry, we'll break it down! 34 | # 35 | # { "type": "FeatureCollection", 36 | # "features": [ 37 | # { "type": "Feature", 38 | # "geometry": {"type": "Point", "coordinates": [, ]}, 39 | # "properties": {"": ""} 40 | # }, 41 | # { "type": "Feature", 42 | # <...> 43 | # } 44 | # ] 45 | # } 46 | 47 | # As we've learned, the dict data type doesn't have an order. Fortunately, 48 | # there's an OrderedDict object we can pull out here to keep our GeoJSON 49 | # structured properly. 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | # Check the format to make sure it's looking like we expect: 66 | # print json.dumps(homicide_json, indent=4) 67 | 68 | # Let's open a file and write all of it. Because JavaScript is a weird beast, 69 | # we need to prefix our GeoJSON output with a "var =" 70 | 71 | 72 | 73 | 74 | 75 | 76 | # The next step's a little more complicated; we're going to parse an existing 77 | # GeoJSON file that shows all of Chicago's communities and add a property 78 | # to it. 79 | 80 | # First thing to do is load the GeoJSON 81 | 82 | 83 | 84 | 85 | 86 | # Let's also summon that long query from last time that calculates a violent 87 | # crime rate from the same data. 
Slight tweak: we're adding the column with 88 | # community ID numbers. 89 | 90 | # viol_rate_sql = '''SELECT chicago_areas.comm_id, chicago_areas.comm_name, ROUND((crime_query.violent_crimes*1.0/chicago_areas.pop2010) * 10000,2) as rate 91 | # FROM ( 92 | # SELECT community_area, COUNT(*) AS violent_crimes 93 | # FROM week 94 | # WHERE primary_type in ('HOMICIDE', 'CRIM SEXUAL ASSAULT', 'ROBBERY', 'ASSAULT', 'BATTERY') 95 | # GROUP BY 1 96 | # ) as crime_query, chicago_areas 97 | # WHERE crime_query.community_area = chicago_areas.comm_id 98 | # ORDER BY 3 DESC''' 99 | 100 | 101 | 102 | 103 | # Now we need to walk through the list of 'features' in our GeoJSON file, checking 104 | # the community ID number against the community IDs in our violent crime list. If 105 | # they match, we're going to insert a property called 'VC_RATE.' 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | # Close the database when we're all done. 118 | 119 | -------------------------------------------------------------------------------- /6_from_apis/API.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # For more information about this particular API, Socrata, the vendor Chicago 4 | # uses for many of its publicly-facing data sources, has excellent 5 | # documentation to read: 6 | # http://dev.socrata.com/foundry/#/data.cityofchicago.org/6zsd-86xi 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | # Let's write a function that check the date of the most recent crime in the JSON and 15 | # returns it. We'll have it convert the date it finds into a datetime object Python 16 | # can do some math on. 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | # We're going to have to convert dates back and forth between strings a bit. Better to 27 | # just go ahead and whip up a short function for it. 28 | 29 | 30 | 31 | 32 | 33 | # Let's write another designed to grab the most recent week of crime from Chicago's API. 34 | # We'll set it up so it can take the date from our date_check function. 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | # With those three functions in place, one short line of code will toss the most recent 45 | # week's worth of crimes into a variable for us. 46 | 47 | 48 | # Let's do some quick checks in the interpreter to see what our data looks like: 49 | # return the first record 50 | # see how many records we received 51 | 52 | # Are all the records we're getting back the same length? This will be important when we 53 | # kick the result over to a database. 54 | 55 | 56 | 57 | # set is a data type that only holds uniques 58 | 59 | # Let's see what one of these shorter crime records looks like. 60 | 61 | 62 | 63 | 64 | 65 | # Open a connection to a SQLite database and create a cursor we'll use to interact with 66 | # said database. (If one doesn't exist, it'll be created on the spot.) 67 | 68 | 69 | 70 | 71 | # We want to make a table, so let's get a list of fields that need to be in there. We 72 | # won't put "location" in, because it's redundant (we already have latitude and longitude, 73 | # not to mention Illinois state plane coordinates in feet). 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | # Let's write a quick function to figure out if the table already exists in our database 85 | # or not. If we query a table that doesn't exist, we'll get an error; let's use that to 86 | # our advantage. 
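# One possible sketch, mirroring completed/API_done.py and assuming `c` is the
# cursor created above:
#
# def table_exist(table_name):
#     try:
#         c.execute("SELECT * FROM {}".format(table_name))
#     except sqlite3.OperationalError:
#         return False
#     else:
#         return True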
87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | # Start if 98 | 99 | 100 | # So now we have to write a SQL statement that will insert values into the right fields, 101 | # regardless of how long the field is. To make this happen, we're also going to need to 102 | # deal with some dict order weirdness by specifying fields for our inserted values. 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | # SQL format: INSERT INTO (, , ...) VALUES ('', '', ...) 111 | 112 | 113 | 114 | 115 | 116 | # Some basic queries based on the data. 117 | 118 | # A function to assess the week's crimes. 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | # And a function to format the result of high_crime_areas! 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | # This could be set up to run some quick summaries as soon as the data is processed by 145 | # the script. 146 | 147 | 148 | 149 | 150 | 151 | # Violent crime rates in each Chicago community for the most recent week of data available 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /9_other_scrapes/other_scrapes_post.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | 4 | 5 | 6 | 7 | 8 | # Other common situation: ASP.NET used to dynamically refresh a page or deliver 9 | # data. We can use requests to send the same data payload a browser would when 10 | # we click on a link to download a file. 11 | # http://www.evagoras.com/2011/02/10/how-postback-works-in-asp-net/ 12 | 13 | # https://www.elections.il.gov/CampaignDisclosure/DownloadList.aspx?DownloadListType=Receipts&LastOnlyNameSearchType=Starts+with&LastOnlyName=&FirstNameSearchType=Starts+with&FirstName=&AddressSearchType=Starts+with&Address=&CitySearchType=Starts+with&City=&State=&Zip=&ZipThru=&ContributionType=All+Types&OccupationSearchType=Starts+with&Occupation=&EmployerSearchType=Starts+with&Employer=&VendorLastOnlyNameSearchType=Starts+with&VendorLastOnlyName=&VendorFirstNameSearchType=Starts+with&VendorFirstName=&VendorAddressSearchType=Starts+with&VendorAddress=&VendorCitySearchType=Starts+with&VendorCity=&VendorState=&VendorZip=&VendorZipThru=&OtherReceiptsDescriptionSearchType=&OtherReceiptsDescription=&PurposeState=Starts+with&Purpose=&Amount=&AmountThru=&RcvDate=10%2f22%2f2015&RcvDateThru=10%2f22%2f2015&Archived=false&QueryType=Contrib&LinkedQuery=false&OrderBy=Date+Received+-+most+recent+first 14 | 15 | # In this case a lot of the query to the campaign finance database is being fed 16 | # through the URL. 17 | 18 | # Let's get a MM-DD-YYYY for today's date to pass into the URL. We'll replace the 19 | # dates currently in there with {0}. 20 | 21 | 22 | 23 | 24 | 25 | # Using Chrome's Developer Tools, we can watch the exchange between the browser 26 | # and the website, than duplicate the response headers that are sent out. 
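# Once the payload below is filled in, the exchange boils down to one call in the
# completed version (completed/other_scrapes_post_done.py):
# page = requests.post(long_url.format(today_string), data=payload)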
27 | 28 | # payload = { 29 | # '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btnText', 30 | # '__EVENTARGUMENT': '', 31 | # '__VIEWSTATE': '/wEPDwULLTEyOTE2NTAxMDUPZBYCZg9kFgICAQ9kFhICAw9kFgICAQ9kFgRmDw8WBB4IQ3NzQ2xhc3MFDmxlZnRNZW51SGVhZGVyHgRfIVNCAgJkZAIBDw8WBB8ABRVsZWZ0TWVudUhlYWRlckNvbnRlbnQfAQICFgIeBXN0eWxlBQ1kaXNwbGF5Om5vbmU7ZAIFD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIHD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIJD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAILD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIND2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIPD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIXDw8WAh4LTmF2aWdhdGVVcmwFHkVsZWN0aW9uSW5mb3JtYXRpb24uYXNweD9JRD0yM2QWAmYPDxYCHgRUZXh0BRdUdWVzZGF5LCBNYXJjaCAxNSwgMjAxNmRkAj4PFgIeCWlubmVyaHRtbAUNRG93bmxvYWQgTGlzdGRkoRQidhsTSMncnEdVcvP+Xmro05Q=', 32 | # '__VIEWSTATEGENERATOR': 'C0457661', 33 | # '__EVENTVALIDATION': '/wEWDAKHkfuRAwKlo/W7AQLQ/PabBgLckKOLAwL3uuXoDAKRh4aQAgLTpeDnCwKbt8nNCAK0ss+BCQL9n+COAgKt7MbzCgLmgNyvCPKcKHXfbl8027eN2h5bYRyzGufA', 34 | # 'ctl00$Accordion1_AccordionExtender_ClientState': -1, 35 | # 'ctl00$Accordion2_AccordionExtender_ClientState': -1, 36 | # 'ctl00$Accordion3_AccordionExtender_ClientState': -1, 37 | # 'ctl00$Accordion4_AccordionExtender_ClientState': -1, 38 | # 'ctl00$Accordion5_AccordionExtender_ClientState': -1, 39 | # 'ctl00$Accordion6_AccordionExtender_ClientState': -1, 40 | # 'ctl00$Accordion7_AccordionExtender_ClientState': -1, 41 | # 'ctl00$mtbSearch': '', 42 | # 'hiddenInputToUpdateATBuffer_CommonToolkitScripts': 1} 43 | 44 | 45 | 46 | # The result that comes back is a text file; we'll just send the contents 47 | # along to an outfile. 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /3_parse_addresses/payday.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We have a list of addresses with varying lengths tied up in an HTML page, but it's 2 | # not in a table. Luckily they follow a pretty predictable format; we need to parse them into 3 | # different columns and stick them in a delimited file. 4 | # 5 | # HOW WE'RE GOING TO DEAL WITH IT: 6 | # - Use line breaks to split one big chunk of text into a list of separate addresses 7 | # - Use flow control to loop through said list (a 'for' loop) 8 | # - Use more flow control in the form of if/elif to send text from each item to the right spots 9 | # - Write each list item to a row in a csv after we've tinkered with it 10 | # - Mop up some other minor issues along the way 11 | 12 | 13 | # Let's import the libraries we'll be using for this parsing task: BeautifulSoup from bs4 and 14 | # Python's csv library. 15 | 16 | 17 | # Open our HTML file in Python, then make a BeautifulSoup object out of it. 18 | 19 | 20 | # Nothing we need is outside of the HTML , which in this case is essentially formatted text 21 | # without HTML tags and hooks we'd use to parse a table. 
22 | # Let's go ahead and pass it to a text variable, that way we can go to work on it with Python's 23 | # string functions. 24 | 25 | # There's an ugly horizontal rule and single line break at the very start; we don't want it there. 26 | 27 | 28 | # Between each address, there are two line break tags. That'll be our split point. 29 | 30 | # There's some cruft in our list; let's slice it out. Skip the first two items and the last item. 31 | 32 | 33 | # Let's go ahead and make a new, empty csv file and get a csv.writer object queued up; it will take 34 | # our work below and write it one row at a time to the file. 35 | 36 | 37 | # First row in the csv will just be a comma-separated list of field names. 38 | 39 | 40 | # Let's begin our 'for' loop here. 41 | 42 | # Just like we used back-to-back
tags to help us split up the text, we can use individual 43 | # tags to further subdivide the details for each lender. For each lender in the list, we're going 44 | # to break it apart and turn it into a new list in the details variable. 45 | 46 | # The lender name is always going to be the first thing in this new sub-list. We're also using 47 | # .strip() to get rid of any leading or trailing whitespace. 48 | 49 | # The number of items in this new sub-list can be three, four or five, with a twist; sometimes 50 | # it's four items because 'doing-business-as' info exists, sometimes it's because the street 51 | # address takes up two lines. 52 | # If there are three items in details, it has to be the name, the street address and the line 53 | # with city, state and zip info. We can safely pass an empty string to our placeholders for 54 | # 'doing-business-as' and a second address line. 55 | 56 | 57 | 58 | 59 | # If there are four items and the second starts with 'D/B/A,' parse accordingly. We're converting 60 | # it to UPPERCASE because this check is case sensitive. 61 | 62 | 63 | # If there is no 'D/B/A,' it must be because of a second street address line. 64 | 65 | 66 | 67 | 68 | # Five items in the list? This could also end with 'else,' meaning if it doesn't fit any of the 69 | # above criteria, just do this instead. 70 | 71 | 72 | 73 | 74 | # So let's deal with the last item in the details sub-list, a line of text that has city, state and 75 | # zip separated by a comma and a single space. Regardless of how many lines the lender details is, 76 | # this will always come at the end, so grab it accordingly. 77 | 78 | # Broken apart, let's pass the pieces to a few variables. 79 | 80 | 81 | 82 | # Now that we've grabbed the lender name, figured out whether a 'D/B/A' line exists (as well as 83 | # a second address line), and broken apart city, state and zip, let's go ahead and write this 84 | # line to our csv. 85 | 86 | 87 | # We're done writing stuff to our csv. Let's close it to save all of our work. 88 | -------------------------------------------------------------------------------- /docs/pt1.md: -------------------------------------------------------------------------------- 1 | #### Introduction 2 | 3 | Before we do anything, we need to understand some of the components we'll be interacting with and switching between. 4 | 5 | - **The Python interpreter**: we input commands, define variables and functions, write loops, etc. The interpreter parses it all for us, line by line as we write them, and takes action. 6 | 7 | - **Text editor**: this is where we essentially chain a bunch of commands together within a single document and save it as a Python script (ending in *.py*). 8 | 9 | - **The command line**: Accessed via "Terminal" on OS X or "cmd.exe" (or possibly PowerShell) on Windows. It's where we run the Python interpreter, among other programs, and issue commands that allow us to navigate our computer's folders. 10 | 11 | That's pretty much it. These three pieces give you a container for code you're writing, the program to run it and a method to get it all running. We'll be doing some switching back and forth between entering code directly into the interpreter we run from the command line and the text editor. 12 | 13 | As we progress, we'll be dealing with two other things. 14 | 15 | - **External libraries**: Python comes with a standard library that has a ton of useful stuff. Developers have written libraries that take this functionality even further. 
By importing parts of these libraries into our Python script, we can do complex things, like fetch a web page, with a single line of code. 16 | 17 | - **A virtual enviroment**: The bubble where all of our code lives. 18 | 19 | This is a crash course in some of Python's major data types and and how to tame them. We'll be doing things like mashing strings together, running through items in a list and defining very simple functions. 20 | 21 | We'll be using iPython's interactive interpreter, which means we have access to a few extra features (which iPython modestly calls *magic functions*). 22 | 23 | The files in this folder: 24 | 25 | - **var.py**: A Python script with some variables of different types like integers, lists and dictionaries. 26 | 27 | - **exercises.py**: A file that gives a list of exercises in the form of comments; guidance on how to manipulate the variables in **var.py**. It's also the place we can write code to save for future reference. Nearly everything we do in this task will use iPython. 28 | 29 | - **fun_with_subs.py**: A file for practicing string substitutions, which is something we'll be doing frequently. It covers the depreciated method you'll encounter frequently and covers ```str.format()```. 30 | 31 | - **whitespace.py**: A guide to how Python decides which blocks of code to run when; it's all controlled by indentation. 32 | 33 | Any finished versions will appear in the **completed** folder. 34 | 35 | Opening iPython is as easy as typing ```ipython``` into PowerShell. 36 | 37 | To get everything from **var.py** queued up, we can either run it from within iPython using one of the _magic functions_: 38 | 39 | ``` 40 | %run var.py 41 | ``` 42 | 43 | Or we can use the more traditional method of importing everything defined in a script, treating it as a module: 44 | 45 | ``` 46 | from var import * 47 | ``` 48 | 49 | This will make a little more sense as we start bringing in other Python libraries to tackle our other tasks. Either way, it's going to get us those variables defined in **var.py**. We can quickly test that these variables were loaded by typing ```lucky_number```. 50 | 51 | Let's say I have a longer chunk of code that I want to try out in iPython to see if it works as expected. 52 | 53 | ``` 54 | presents = ['A brand new car', 'Socks'] 55 | 56 | for gift in presents: 57 | if gift == 'A brand new car': 58 | print gift, '<- Oh yeah!' 59 | else: 60 | print gift, '<- Meh.' 61 | ``` 62 | Having to type this line by line into the interpreter and pay attention to indentation can be a pain. 63 | 64 | But I can write that block of code in a text editor instead, then select it all and copy it. Switching over to iPython, I can use the built-in paste function: 65 | 66 | ``` 67 | %paste 68 | ``` 69 | 70 | This will paste the contents of my clipboard, preserving all the indentation and other white space, and execute whatever code was there. Pretty handy. 71 | 72 | Finally, iPython has another function that will give you a recap of commands you've typed: 73 | 74 | ``` 75 | %history 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /7_out_of_dbs/completed/to_json_done.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | # Now that we've processed JSON, moved into into a database an set up some helpful 4 | # prewritten queries for new records, we're going to talk about the flip side of that: 5 | # exporting data to JSON for use elsewhere. 
6 | 7 | # PROBLEM: We have some cool crime data and have set up the automated analysis 8 | # of weekly records for interesting trends we may want to keep regular tabs 9 | # on. It all comes with coordinates, though; why don't we use that to our 10 | # advantage and generate some stuff we can actually see? 11 | 12 | # HOW WE'RE GOING TO DEAL WITH IT: 13 | # - Query our database, crawl through the rows of results to make a GeoJSON 14 | # file of points from scratch for display when connected to a Leaflet map. 15 | # - Query our database again while going through an existing GeoJSON and 16 | # adding data to it, that way we can display the result. 17 | 18 | import sqlite3 19 | import json 20 | import collections 21 | 22 | # Connect to the crime.db we made earlier and grab all the homicide records. 23 | db_loc = '../6_from_apis/crime.db' 24 | conn = sqlite3.connect(db_loc) 25 | c = conn.cursor() 26 | 27 | c.execute('''SELECT * 28 | FROM week 29 | WHERE primary_type = "HOMICIDE"''') 30 | 31 | homicides = c.fetchall() 32 | 33 | # The GeoJSON format for points we'll need to write. Don't worry, we'll break it down! 34 | # 35 | # { "type": "FeatureCollection", 36 | # "features": [ 37 | # { "type": "Feature", 38 | # "geometry": {"type": "Point", "coordinates": [, ]}, 39 | # "properties": {"": ""} 40 | # }, 41 | # { "type": "Feature", 42 | # <...> 43 | # } 44 | # ] 45 | # } 46 | 47 | # As we've learned, the dict data type doesn't have an order. Fortunately, 48 | # there's an OrderedDict object we can pull out here to keep our GeoJSON 49 | # structured properly. 50 | 51 | homicide_json = collections.OrderedDict() 52 | 53 | homicide_json['type'] = 'FeatureCollection' 54 | homicide_json['features'] = [] 55 | 56 | for h in homicides: 57 | feat = collections.OrderedDict() 58 | feat['type'] = 'Feature' 59 | feat['geometry'] = collections.OrderedDict() 60 | feat['geometry']['type'] = 'Point' 61 | feat['geometry']['coordinates'] = [float(h[14]), float(h[12])] 62 | feat['properties'] = {'block': h[2], 'location': h[13], 'datetime': h[5]} 63 | homicide_json['features'].append(feat) 64 | 65 | # Check the format to make sure it's looking like we expect: 66 | # print json.dumps(homicide_json, indent=4) 67 | 68 | # Let's open a file and write all of it. Because JavaScript is a weird beast, 69 | # we need to prefix our GeoJSON output with a "var =" 70 | 71 | with open('homicide.geojson', 'wb') as outfile: 72 | outfile.write('var homicide_points = ') 73 | json.dump(homicide_json, outfile) 74 | 75 | 76 | # The next step's a little more complicated; we're going to parse an existing 77 | # GeoJSON file that shows all of Chicago's communities and add a property 78 | # to it. 79 | 80 | # First thing to do is load the GeoJSON 81 | 82 | geojson_loc = 'comm_areas.geojson' 83 | with open(geojson_loc, 'rb') as geojson_file: 84 | comm_areas = json.load(geojson_file) 85 | 86 | # Let's also summon that long query from last time that calculates a violent 87 | # crime rate from the same data. Slight tweak: we're adding the column with 88 | # community ID numbers. 
89 | 90 | viol_rate_sql = '''SELECT chicago_areas.comm_id, chicago_areas.comm_name, ROUND((crime_query.violent_crimes*1.0/chicago_areas.pop2010) * 10000,2) as rate 91 | FROM ( 92 | SELECT community_area, COUNT(*) AS violent_crimes 93 | FROM week 94 | WHERE primary_type in ('HOMICIDE', 'CRIM SEXUAL ASSAULT', 'ROBBERY', 'ASSAULT', 'BATTERY') 95 | GROUP BY 1 96 | ) as crime_query, chicago_areas 97 | WHERE crime_query.community_area = chicago_areas.comm_id 98 | ORDER BY 3 DESC''' 99 | 100 | c.execute(viol_rate_sql) 101 | viol_rate = c.fetchall() 102 | 103 | # Now we need to walk through the list of 'features' in our GeoJSON file, checking 104 | # the community ID number against the community IDs in our violent crime list. If 105 | # they match, we're going to insert a property called 'VC_RATE.' 106 | 107 | for shape in comm_areas['features']: 108 | comm_id = int(shape['properties']['AREA_NUMBE']) 109 | for loc in viol_rate: 110 | if loc[0] == comm_id: 111 | shape['properties']['VC_RATE'] = loc[2] 112 | 113 | with open('comm_area_plusrate.geojson', 'wb') as outfile: 114 | outfile.write('var comm = ') 115 | json.dump(comm_areas, outfile) 116 | 117 | # Close the database when we're all done. 118 | conn.close() 119 | -------------------------------------------------------------------------------- /9_other_scrapes/completed/other_scrapes_post_done.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from datetime import date 6 | import re 7 | 8 | # Other common situation: ASP.NET used to dynamically refresh a page or deliver 9 | # data. We can use requests to send the same data payload a browser would when 10 | # we click on a link to download a file. 11 | # http://www.evagoras.com/2011/02/10/how-postback-works-in-asp-net/ 12 | 13 | # https://www.elections.il.gov/CampaignDisclosure/DownloadList.aspx?DownloadListType=Receipts&LastOnlyNameSearchType=Starts+with&LastOnlyName=&FirstNameSearchType=Starts+with&FirstName=&AddressSearchType=Starts+with&Address=&CitySearchType=Starts+with&City=&State=&Zip=&ZipThru=&ContributionType=All+Types&OccupationSearchType=Starts+with&Occupation=&EmployerSearchType=Starts+with&Employer=&VendorLastOnlyNameSearchType=Starts+with&VendorLastOnlyName=&VendorFirstNameSearchType=Starts+with&VendorFirstName=&VendorAddressSearchType=Starts+with&VendorAddress=&VendorCitySearchType=Starts+with&VendorCity=&VendorState=&VendorZip=&VendorZipThru=&OtherReceiptsDescriptionSearchType=&OtherReceiptsDescription=&PurposeState=Starts+with&Purpose=&Amount=&AmountThru=&RcvDate=10%2f22%2f2015&RcvDateThru=10%2f22%2f2015&Archived=false&QueryType=Contrib&LinkedQuery=false&OrderBy=Date+Received+-+most+recent+first 14 | 15 | # In this case a lot of the query to the campaign finance database is being fed 16 | # through the URL. 17 | 18 | # Let's get a MM-DD-YYYY for today's date to pass into the URL. We'll replace the 19 | # dates currently in there with {0}. 
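# (Both RcvDate and RcvDateThru in the URL use the same {0} placeholder, so the
# single .format(today_string) call farther down fills in both ends of the
# one-day date range at once.)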
20 | 21 | today_string = date.strftime(date.today(), '%m-%d-%Y') 22 | 23 | long_url = 'https://www.elections.il.gov/CampaignDisclosure/DownloadList.aspx?DownloadListType=Receipts&LastOnlyNameSearchType=Starts+with&LastOnlyName=&FirstNameSearchType=Starts+with&FirstName=&AddressSearchType=Starts+with&Address=&CitySearchType=Starts+with&City=&State=&Zip=&ZipThru=&ContributionType=All+Types&OccupationSearchType=Starts+with&Occupation=&EmployerSearchType=Starts+with&Employer=&VendorLastOnlyNameSearchType=Starts+with&VendorLastOnlyName=&VendorFirstNameSearchType=Starts+with&VendorFirstName=&VendorAddressSearchType=Starts+with&VendorAddress=&VendorCitySearchType=Starts+with&VendorCity=&VendorState=&VendorZip=&VendorZipThru=&OtherReceiptsDescriptionSearchType=&OtherReceiptsDescription=&PurposeState=Starts+with&Purpose=&Amount=&AmountThru=&RcvDate={0}&RcvDateThru={0}&Archived=false&QueryType=Contrib&LinkedQuery=false&OrderBy=Date+Received+-+most+recent+first' 24 | 25 | # Using Chrome's Developer Tools, we can watch the exchange between the browser 26 | # and the website, than duplicate the response headers that are sent out. 27 | 28 | payload = { 29 | '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btnText', 30 | '__EVENTARGUMENT': '', 31 | '__VIEWSTATE': '/wEPDwULLTEyOTE2NTAxMDUPZBYCZg9kFgICAQ9kFhICAw9kFgICAQ9kFgRmDw8WBB4IQ3NzQ2xhc3MFDmxlZnRNZW51SGVhZGVyHgRfIVNCAgJkZAIBDw8WBB8ABRVsZWZ0TWVudUhlYWRlckNvbnRlbnQfAQICFgIeBXN0eWxlBQ1kaXNwbGF5Om5vbmU7ZAIFD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIHD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIJD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAILD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIND2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIPD2QWAgIBD2QWBGYPDxYEHwAFDmxlZnRNZW51SGVhZGVyHwECAmRkAgEPDxYEHwAFFWxlZnRNZW51SGVhZGVyQ29udGVudB8BAgIWAh8CBQ1kaXNwbGF5Om5vbmU7ZAIXDw8WAh4LTmF2aWdhdGVVcmwFHkVsZWN0aW9uSW5mb3JtYXRpb24uYXNweD9JRD0yM2QWAmYPDxYCHgRUZXh0BRdUdWVzZGF5LCBNYXJjaCAxNSwgMjAxNmRkAj4PFgIeCWlubmVyaHRtbAUNRG93bmxvYWQgTGlzdGRkoRQidhsTSMncnEdVcvP+Xmro05Q=', 32 | '__VIEWSTATEGENERATOR': 'C0457661', 33 | '__EVENTVALIDATION': '/wEWDAKHkfuRAwKlo/W7AQLQ/PabBgLckKOLAwL3uuXoDAKRh4aQAgLTpeDnCwKbt8nNCAK0ss+BCQL9n+COAgKt7MbzCgLmgNyvCPKcKHXfbl8027eN2h5bYRyzGufA', 34 | 'ctl00$Accordion1_AccordionExtender_ClientState': -1, 35 | 'ctl00$Accordion2_AccordionExtender_ClientState': -1, 36 | 'ctl00$Accordion3_AccordionExtender_ClientState': -1, 37 | 'ctl00$Accordion4_AccordionExtender_ClientState': -1, 38 | 'ctl00$Accordion5_AccordionExtender_ClientState': -1, 39 | 'ctl00$Accordion6_AccordionExtender_ClientState': -1, 40 | 'ctl00$Accordion7_AccordionExtender_ClientState': -1, 41 | 'ctl00$mtbSearch': '', 42 | 'hiddenInputToUpdateATBuffer_CommonToolkitScripts': 1} 43 | 44 | page = requests.post(long_url.format(today_string), data=payload) 45 | 46 | # The result that comes back is a text file; we'll just send the contents 47 | # along to an outfile. 
48 | 49 | with open('contributions-{}.txt'.format(today_string), 'wb') as outfile: 50 | outfile.write(page.content) 51 | -------------------------------------------------------------------------------- /3_parse_addresses/completed/payday_done.py: -------------------------------------------------------------------------------- 1 | # PROBLEM: We have a list of addresses with varying lengths tied up in an HTML page, but it's 2 | # not in a table. Luckily they follow a pretty predictable format; we need to parse them into 3 | # different columns and stick them in a delimited file. 4 | # 5 | # HOW WE'RE GOING TO DEAL WITH IT: 6 | # - Use line breaks to split one big chunk of text into a list of separate addresses 7 | # - Use flow control to loop through said list (a 'for' loop) 8 | # - Use more flow control in the form of if/elif to send text from each item to the right spots 9 | # - Write each list item to a row in a csv after we've tinkered with it 10 | # - Mop up some other minor issues along the way 11 | 12 | 13 | # Let's import the libraries we'll be using for this parsing task: BeautifulSoup from bs4 and 14 | # Python's csv library. 15 | from bs4 import BeautifulSoup 16 | import csv 17 | 18 | # Open our HTML file in Python, then make a BeautifulSoup object out of it. 19 | html = open('payday_lenders.html', 'rb') 20 | soup = BeautifulSoup(html) 21 | 22 | # Nothing we need is outside of the HTML , which in this case is essentially formatted text 23 | # without HTML tags and hooks we'd use to parse a table. 24 | # Let's go ahead and pass it to a text variable, that way we can go to work on it with Python's 25 | # string functions. 26 | block = unicode(soup.body) 27 | # There's an ugly horizontal rule and single line break at the very start; we don't want it there. 28 | block = block.replace('

', '') 29 | 30 | # Between each address, there are two line break tags. That'll be our split point. 31 | list = block.split('

') 32 | # There's some cruft in our list; let's slice it out. Skip the first two items and the last item. 33 | list = list[2:len(list)-1] 34 | 35 | # Let's go ahead and make a new, empty csv file and get a csv.writer object queued up; it will take 36 | # our work below and write it one row at a time to the file. 37 | csv_file = open('payday_lenders.csv', 'wb') 38 | output = csv.writer(csv_file) 39 | # First row in the csv will just be a comma-separated list of field names. 40 | output.writerow(['NAME', 'DBA', 'STADDR', 'STADDR2', 'CITY', 'STATE', 'ZIP']) 41 | 42 | # Let's begin our 'for' loop here. 43 | for lender in list: 44 | # Just like we used back-to-back
tags to help us split up the text, we can use individual 45 | # tags to further subdivide the details for each lender. For each lender in the list, we're going 46 | # to break it apart and turn it into a new list in the details variable. 47 | details = lender.split('
') 48 | # The lender name is always going to be the first thing in this new sub-list. We're also using 49 | # .strip() to get rid of any leading or trailing whitespace. 50 | name = details[0].strip() 51 | # The number of items in this new sub-list can be three, four or five, with a twist; sometimes 52 | # it's four items because 'doing-business-as' info exists, sometimes it's because the street 53 | # address takes up two lines. 54 | # If there are three items in details, it has to be the name, the street address and the line 55 | # with city, state and zip info. We can safely pass an empty string to our placeholders for 56 | # 'doing-business-as' and a second address line. 57 | if len(details) == 3: 58 | dba = '' 59 | staddr = details[1].strip() 60 | staddr2 = '' 61 | # If there are four items and the second starts with 'D/B/A,' parse accordingly. We're converting 62 | # it to UPPERCASE because this check is case sensitive. 63 | elif len(details) == 4 and details[1].upper().startswith('D/B/A'): 64 | dba = details[1].strip() 65 | staddr = details[2].strip() 66 | staddr2 ='' 67 | # If there is no 'D/B/A,' it must be because of a second street address line. 68 | elif len(details) == 4 and not details[1].upper().startswith('D/B/A'): 69 | dba = '' 70 | staddr = details[1].strip() 71 | staddr2 = details[2].strip() 72 | # Five items in the list? This could also end with 'else,' meaning if it doesn't fit any of the 73 | # above criteria, just do this instead. 74 | elif len(details) == 5: 75 | dba = details[1].strip() 76 | staddr = details[2].strip() 77 | staddr2 = details[3].strip() 78 | # So let's deal with the last item in the details sub-list, a line of text that has city, state and 79 | # zip separated by a comma and a single space. Regardless of how many lines the lender details is, 80 | # this will always come at the end, so grab it accordingly. 81 | citystzip = details[len(details)-1].split(', ') 82 | # Broken apart, let's pass the pieces to a few variables. 83 | city = citystzip[0].strip() 84 | state = citystzip[1].strip() 85 | zip = citystzip[2].strip() 86 | # Now that we've grabbed the lender name, figured out whether a 'D/B/A' line exists (as well as 87 | # a second address line), and broken apart city, state and zip, let's go ahead and write this 88 | # line to our csv. 89 | output.writerow([name, dba, staddr, staddr2, city, state, zip]) 90 | 91 | # We're done writing stuff to our csv. Let's close it to save all of our work. 92 | csv_file.close() 93 | -------------------------------------------------------------------------------- /8_cleaning/completed/names_done.py: -------------------------------------------------------------------------------- 1 | #! usr/bin/python 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import csv 6 | import re 7 | import time 8 | 9 | main_url = 'https://webapps1.cityofchicago.org/moboco/org/cityofchicago/moboc/controller/view/start.do' 10 | 11 | page = requests.get(main_url) 12 | 13 | soup = BeautifulSoup(page.content, 'html.parser') 14 | 15 | board_links = soup.find_all('a', href=re.compile('cid=')) 16 | 17 | # Quick check to make sure we're getting all of our links before we move on 18 | for link in board_links: 19 | print link.text 20 | print 'Total number of boards: '+str(len(board_links)) 21 | 22 | # Let's look at the URL string 23 | print board_links[0].get('href') 24 | print board_links[1].get('href') 25 | 26 | # What we really care about are the numbers at the end of the link; that's 27 | # how the whole setup for the website works. 
We feed it an ID, it generates 28 | # a page. 29 | 30 | board_pages = [] 31 | 32 | for link in board_links: 33 | board_name = link.string 34 | board_url = link.get('href') 35 | # We only need the number at the end; we can split the URL and grab it 36 | board_code = board_url.split('?cid=')[1] 37 | tup = (board_name, board_code) 38 | board_pages.append(tup) 39 | 40 | board_url = 'https://webapps1.cityofchicago.org/moboco/org/cityofchicago/moboc/controller/view/searchBoard.do?cid=' 41 | 42 | # Just as a test, let's look at the first one to see what we're dealing with. 43 | 44 | test_page = requests.get(board_url+board_pages[0][1]) 45 | soup = BeautifulSoup(test_page.content, 'html.parser') 46 | 47 | # Two tables on each page, both are functionally identical to BeautifulSoup, 48 | # so we just grab both and take the second. 49 | 50 | page_tables = soup.find_all('table', {'class': 'resultTable'}) 51 | member_table = page_tables[1].find_all('tr') 52 | 53 | # The names are coming from a db that's dynamically combining them; we can use that. 54 | # All the components are inside of label tags. If we try to extract the text, we're 55 | # met with a mess of unicode junk (non-breaking spaces, in this case) meant to 56 | # glue the title, first name, middle initial and last name together. 57 | 58 | first_row = member_table[1] 59 | 60 | # Pull the name and clean it up with split(), which by default will work on 61 | # whitespace. .join and .split together can be handy for cleaning. 62 | 63 | name_text = first_row.td.text 64 | clean_name = ' '.join(name_text.split()) 65 | 66 | # Depending on how we need the name to be parsed, it might be better for us 67 | # to chop it up into components. 68 | 69 | name_list = [] 70 | for name_part in first_row.td.find_all('label'): 71 | name_list.append(name_part.text) 72 | 73 | # Now that we see how this works, we can write a full run. We'll feed 74 | # requests.get a page, parse it with BeautifulSoup, and write it to CSV. 75 | 76 | # We can use the 'with' syntax here to open the file 77 | with open('./boards.csv', 'wb') as csv_outfile: 78 | writer = csv.writer(csv_outfile) 79 | writer.writerow(['BOARD', 'FULL_NAME', 'TITLE', 'FIRST_NAME', 'MIDDLE', 'LASTNAME', 'SUFFIX', 'TERM', 'APPOINTER']) 80 | for board_page in board_pages: 81 | # We can basically use what we've done above to zero in on the table 82 | page = requests.get(board_url+board_page[1]) 83 | soup = BeautifulSoup(page.content, 'html.parser') 84 | page_tables = soup.find_all('table', {'class': 'resultTable'}) 85 | member_table = page_tables[1].find_all('tr') 86 | # Another for loop to move inside each row (except the header) 87 | for row in member_table[1:]: 88 | cells = row.find_all('td') 89 | name_cell = cells[0] 90 | term_cell = cells[1] 91 | appt_cell = cells[2] 92 | full_name = ' '.join(name_cell.text.split()) 93 | # If it's a vacancy, let's not go through the trouble of splitting. 94 | if full_name == 'Vacancy': 95 | title, first_name, mi, last_name, suffix = '', '', '', '', '' 96 | else: 97 | # Slot the different components here 98 | name_parts = [] 99 | for part in name_cell.find_all('label'): 100 | name_parts.append(part.text) 101 | title = name_parts[0] 102 | first_name = name_parts[1] 103 | mi = name_parts[2] 104 | # Last name is currently including suffixes like Jr, Sr, III, etc. 105 | # We can look for a comma that denotes the suffix. 
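# (str.find() returns -1 when the substring isn't present, so -1 below means
# the name has no suffix to split off.)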
106 | if name_parts[3].find(',') == -1: 107 | last_name = name_parts[3] 108 | suffix = '' 109 | else: 110 | last_split = name_parts[3].split(', ') 111 | last_name = last_split[0] 112 | suffix = last_split[1] 113 | term = term_cell.text.strip() 114 | appt = appt_cell.text.strip() 115 | writer.writerow([board_page[0], full_name, title, first_name, mi, last_name, suffix, term, appt]) 116 | print 'Fetched and wrote '+board_page[0] 117 | time.sleep(2) 118 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Coding for Journalists 2 | 3 | This class is an evolving introduction to coding and the Python programming language for journalists. In addition to a tour of the fundamentals, it spans nine basic projects to get you started. 4 | 5 | The [first version](https://github.com/ireapps/coding-for-journalists/tree/ire2015) of this course happened at the [2015 IRE Conference in Philadelphia](https://ire.org/conferences/ire-2015/); the repository and associated tasks have been updated since. 6 | 7 | A few things to note before we get rolling: 8 | 9 | * This guide and all documentation live at [coding-for-journalists.rtfd.org](http://coding-for-journalists.rtfd.org) 10 | * The GitHub repository that includes all code is available from [github.com/ireapps/coding-for-journalists](https://github.com/ireapps/coding-for-journalists) 11 | * A good place to raise issues with the code or ask specific questions about the code is [also on GitHub](https://github.com/ireapps/coding-for-journalists/issues) 12 | * [Email](mailto:alex@ire.org) IRE Training Director Alex Richards, the primary author of this course, or [contact him on Twitter](http://www.twitter.com/alexrichards) 13 | 14 | We have set aside time at the end of the workshop to work through the process of getting your home or work laptop set up with Python and a development environment. If you're looking for [a guide](install.md), we've put one together that covers most of what you'll need for OS X and Windows. 15 | 16 | Each task we'll tackle and set of lessons has finished and working versions of the code in the "completed" folder; they typically have "_done" appended to the end of the filename. If there isn't enough time to hit each one during our time together, there's enough commenting in place that you should be able to work through them on your own (and feel free to bug [Alex](mailto:alex@ire.org)). 17 | 18 | This is designed for people who have some grounding in data journalism already and experience with spreadsheets and database managers. It's helpful if you understand Excel functions, for example, and some basic SQL. Other important skills include navigation of the computer's command line; we'll bemoving between folders, running scripts and issuing commands to a Python interpreter. 19 | 20 | ## What the next few days have in store 21 | 22 | #### Introduction 23 | 24 | A whirlwind tour of Python's data types, variables, basic functionality and loops. We'll write a bunch of them on our own, discuss them, and then run a script in the iPython interactive interpreter to load a variety of variables and then start messing with them. 25 | 26 | We'll also talk about how to format strings — which we'll be doing a ton — and how Python deals with whitespace. 27 | 28 | #### Scraping data from the web 29 | 30 | Fetching data from the web was one of programming's original beachheads in the journalism world. 
We're going to grab a data table from a website and turn it into a delimited text file to use in a spreadsheet or database manager. 31 | 32 | This is a task one could arguably accomplish using Excel's "import from web" feature, so we're going to take it one step further: we'll alter our script to drill into additional detail pages and automatically extract more columns of data for our text file. 33 | 34 | We'll also drill into the finer points of using the BeautifulSoup library to parse HTML, reading and writing CSV files, and targeting data with regular expressions. 35 | 36 | #### Parsing records that fall across multiple lines 37 | 38 | Data don't always arrive in a nice, neat table. We have hundreds of addresses to parse into a format that will work in Excel or a database manager, and Python is going to help us make it happen. We'll write a script that dices it all and puts everything in its proper place -- one line for one location. 39 | 40 | #### Making a reusable function 41 | 42 | In this quick project, we'll modify the address parsing script we wrote in the previous exercise, turning it into a reusable function that we can apply to future address lists that arrive in the same problematic format. 43 | 44 | #### Geocoding with Python 45 | 46 | Geocoding is one of those perennial data journalism problems that's gotten easier in some ways over the years, yet harder in others. While having a street address is great, having a latitude and longitude is better. We'll take the poorly formatted addresses we coaxed into a flat file and march them one at a time through an online geocoding service using a handy Python library called [geopy](https://github.com/geopy/geopy). 47 | 48 | #### Working with APIs and databases 49 | 50 | Application Programming Interfaces have become a common spigot for data on the web. We'll tap into one maintained by the city of Chicago that deals with crime and send it to a table in a SQLite database. From there, we'll write some scripted queries to isolate interesting information. 51 | 52 | We'll also dig in on how Python interacts with databases and how it deals with dates and times. 53 | 54 | #### Unlocking data stuck in a database 55 | 56 | A database is a great reporting tool, having your data and analysis locked up there don't help your audience much. We're going to work with the same crime data we processed earlier and turn it into a web-friendly format (GeoJSON) for automatic display on a very basic Leaflet.js map. 57 | 58 | #### The wonderful world of data cleaning 59 | 60 | Once you figure out what the problems are with a data set, you can outsource the tedious cleaning process to Python. We'll focus on a few different types of cleaning you're likely to encounter in your reporting life, including Excel files where data is scattered around different rows and columns, CSV files with obvious errors that 61 | 62 | #### Other kinds of scrapes 63 | 64 | We're not always after web tables; sometimes we're trying to collect a bunch of files scattered around a website or need to POST some data in order to get a response from a dynamic page, like a government site that uses ASP.NET. 65 | 66 | #### The deal with text encoding and debugging your scripts 67 | 68 | What's ASCII? Why the hell am I getting ```UnicodeEncodeError```? We'll deal sporadically with text encoding in some of these other lessons, but we'll focus on why it's important and what you can do to stay on top of it. 
In addition, we'll look through common errors you'll run into, what they mean and how to fix them. -------------------------------------------------------------------------------- /1_intro/exercises.py: -------------------------------------------------------------------------------- 1 | # ---------------- 2 | # BASIC DATA TYPES 3 | # ---------------- 4 | 5 | # Print your name in the interpreter. 6 | 7 | 8 | # Print an integer. 9 | 10 | 11 | # Print a basic math equation. 12 | 13 | 14 | # Define a string called my_string and wrap it in single quotes. 15 | 16 | 17 | # Define another string called my_other_string and wrap it in double quotes. 18 | 19 | 20 | # Define an integer (whole number) and call it my_integer. 21 | 22 | 23 | # Define a float (number that contains a fraction of one) called my_float. 24 | 25 | 26 | # Define a text string in unicode format (we'll see these pop up throughout. 27 | # the workshop. 28 | 29 | 30 | # Define a boolean value (True/False) called my_boolean. 31 | 32 | 33 | # Print my_string. 34 | 35 | 36 | # Print my_string and my_other_string together. 37 | 38 | 39 | # Let's get a space in there. 40 | 41 | 42 | # Print my_integer divided by 3. 43 | 44 | 45 | # If we don't define one of these as a float, Python 2.7 lops off extra. 46 | # digits. Try it again with the 3 as a float. 47 | 48 | 49 | # Check the data types of some of what we defined above. 50 | 51 | 52 | 53 | 54 | # Print the result of checking whether my_boolean is True and is not True. 55 | 56 | 57 | 58 | 59 | # In iPython, typing the following will load the variables from 'var.py' 60 | # into the interactive interpreter: %run var.py 61 | # We can also type: from var import * 62 | 63 | # ---------------- 64 | # FUN WITH NUMBERS 65 | # ---------------- 66 | 67 | # Print the contents of lucky_number 68 | 69 | 70 | # Subtract 18 from lucky_number and print it 71 | 72 | 73 | # Add six to lucky_number and put it in a variable called unlucky_number 74 | 75 | 76 | # Print unlucky_number 77 | 78 | 79 | # Set lucky_number to lucky_number plus one; print lucky_number 80 | 81 | 82 | 83 | # Check to see if lucky_number and unlucky_number are equal and print the result 84 | 85 | 86 | # Check to see if lucky_number is less than unlucky_number and print the result 87 | 88 | 89 | # Check unlucky_number's type 90 | 91 | 92 | # Check the type of unlucky_number added to fuel_2015 93 | 94 | 95 | 96 | # ---------------- 97 | # FUN WITH STRINGS 98 | # ---------------- 99 | 100 | # Print the contents of sentiment 101 | 102 | 103 | # Print the length of sentiment 104 | 105 | 106 | # Print the length of lucky_number 107 | 108 | 109 | # Try printing sentiment as all capital letters 110 | 111 | 112 | # In a variable called new_sentiment, put sentiment in all caps again and replace 113 | # 'moderately' with 'extremely' 114 | 115 | 116 | # Print the result 117 | 118 | 119 | # Print ugly_string, which has too many spaces 120 | 121 | 122 | # Try splitting that string apart (defaults to space) 123 | 124 | 125 | # Try splitting ugly_string on San 126 | 127 | 128 | # Join a series of words together with a space between each and print the result 129 | 130 | 131 | # Do the same thing but use Python's join function 132 | 133 | 134 | # Split ugly_string apart again based on spaces, then join back together with 135 | # a single space between the words; call it pretty_string 136 | 137 | 138 | # Print the string 'apple ' three times 139 | 140 | 141 | 142 | # ----- 143 | # LISTS 144 | # ----- 145 | 146 | # Define a list called my_list that 
contains three strings: Tomato, Celery 147 | # and Carrot 148 | 149 | 150 | # Print the list 151 | 152 | 153 | # Print the first item in the list 154 | 155 | 156 | # Print the second item in the list 157 | 158 | 159 | # Add 'Potato' to my_list 160 | 161 | 162 | # Print the contents of my_list again 163 | 164 | 165 | # ------------ 166 | # DICTIONARIES 167 | # ------------ 168 | 169 | # Make a simple dictionary of four items called my_dict: 170 | # class: Python, location: New York, time: 9am, attendance: 20 171 | 172 | 173 | 174 | # Print my_dict. 175 | 176 | 177 | # Print the value for location. 178 | 179 | 180 | # Print the keys in my_dict. 181 | 182 | 183 | # Print the values in my_dict. 184 | 185 | 186 | # Check to see if a key 'month' exists in my_dict 187 | 188 | 189 | 190 | # --------------------------------- 191 | # FUN WITH LISTS (AND DICTIONARIES) 192 | # --------------------------------- 193 | 194 | # Print months 195 | 196 | 197 | # Print the length of months 198 | 199 | 200 | # Add missing month to list of months; print months again 201 | 202 | 203 | 204 | # Print the first item in the months list 205 | 206 | 207 | # Print the third item in the months list 208 | 209 | 210 | # Print the last item in the months list 211 | 212 | 213 | # Print the third through sixth items; print everything from seven onward 214 | 215 | 216 | 217 | # Print multi_list 218 | 219 | 220 | # Print the second item in multi_list's last list 221 | 222 | 223 | # Print person_info 224 | 225 | 226 | # Print the item linked to first_name in person_info 227 | 228 | 229 | # Add Pennsylvania with a key of state to person_info; print the result 230 | 231 | 232 | 233 | # Change city in person_info to Scranton; print the result 234 | 235 | 236 | 237 | 238 | # --------------------- 239 | # FUN WITH CONTROL FLOW 240 | # --------------------- 241 | 242 | # Write a for loop that prints each month in the months list 243 | 244 | 245 | 246 | # Get a list of the keys from the person_info dictionary 247 | 248 | 249 | 250 | # Write a for loop that prints the key/value pair in our person_info dictionary 251 | 252 | 253 | 254 | # A for loop that gives a quick summary of each list in multi_list 255 | 256 | 257 | 258 | 259 | 260 | 261 | # An if/else statement that checks the value in lucky_number 262 | 263 | 264 | 265 | 266 | 267 | 268 | # ------------------ 269 | # FUN WITH FUNCTIONS 270 | # ------------------ 271 | 272 | # Define a basic function that prints the word 'beans' 273 | 274 | 275 | 276 | # Run the PrintBeans() function. 277 | 278 | 279 | # Define another basic function that multiplies a number by itself 280 | 281 | 282 | 283 | 284 | 285 | # Find the square of 27. 286 | 287 | 288 | # Try finding the square of 'apple.' 289 | 290 | 291 | # Let's turn that list summary for loop from earlier into a function 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | # Append the months list to multi_list; run the ListSummary function on it 302 | 303 | 304 | -------------------------------------------------------------------------------- /site/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Coding for Journalists 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
221 | 222 | 223 | 224 | -------------------------------------------------------------------------------- /6_from_apis/completed/API_done.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # For more information about this particular API, Socrata, the vendor Chicago 4 | # uses for many of its publicly-facing data sources, has excellent 5 | # documentation to read: 6 | # http://dev.socrata.com/foundry/#/data.cityofchicago.org/6zsd-86xi 7 | 8 | import requests 9 | import sqlite3 10 | from datetime import datetime, date, timedelta 11 | 12 | crime_url = 'https://data.cityofchicago.org/resource/6zsd-86xi.json' 13 | 14 | # Let's write a function that check the date of the most recent crime in the JSON and 15 | # returns it. We'll have it convert the date it finds into a datetime object Python 16 | # can do some math on. 17 | 18 | 19 | def date_check(): 20 | # URL: https://data.cityofchicago.org/resource/6zsd-86xi.json?$limit=1&$order=date DESC 21 | r = requests.get('{0}?$limit=1&$order=date DESC'.format(crime_url)) 22 | most_recent_crime = r.json() 23 | date_string = most_recent_crime[0]['date'][:10] 24 | return datetime.strptime(date_string, '%Y-%m-%d') 25 | 26 | # We're going to have to convert dates back and forth between strings a bit. Better to 27 | # just go ahead and whip up a short function for it. 28 | 29 | 30 | def date_to_string(dt): 31 | return dt.date().isoformat() 32 | 33 | # Let's write another designed to grab the most recent week of crime from Chicago's API. 34 | # We'll set it up so it can take the date from our date_check function. 35 | 36 | 37 | def crime_week(dt): 38 | end_date = date_to_string(dt) 39 | start_date = date_to_string(dt - timedelta(days=6)) 40 | query_url = '{0}?$limit=50000&$where=date between \'{1}T00:00:00\' and \'{2}T23:59:59\''.format(crime_url, start_date, end_date) 41 | r = requests.get(query_url) 42 | return r.json() 43 | 44 | # With those three functions in place, one short line of code will toss the most recent 45 | # week's worth of crimes into a variable for us. 46 | week = crime_week(date_check()) 47 | 48 | # Let's do some quick checks in the interpreter to see what our data looks like: 49 | # week[0] # return the first record 50 | # len(week) # see how many records we received 51 | 52 | # Are all the records we're getting back the same length? This will be important when we 53 | # kick the result over to a database. 54 | # lengths = [] 55 | # for rec in week: 56 | # lengths.append(len(rec)) 57 | # print set(lengths) # set is a data type that only holds uniques 58 | 59 | # Let's see what one of these shorter crime records looks like. 60 | # for rec in week: 61 | # if len(rec) == 17: 62 | # shorter = rec 63 | # break 64 | 65 | # Open a connection to a SQLite database and create a cursor we'll use to interact with 66 | # said database. (If one doesn't exist, it'll be created on the spot.) 67 | db_loc = 'crime.db' 68 | conn = sqlite3.connect(db_loc) 69 | c = conn.cursor() 70 | 71 | # We want to make a table, so let's get a list of fields that need to be in there. We 72 | # won't put "location" in, because it's redundant (we already have latitude and longitude, 73 | # not to mention Illinois state plane coordinates in feet). 
74 | fields = [] 75 | 76 | for rec in week: 77 | if len(rec) == 22: 78 | for item in rec.items(): 79 | fields.append(item[0]) 80 | fields.remove('location') 81 | fields.sort() 82 | break 83 | 84 | # Let's write a quick function to figure out if the table already exists in our database 85 | # or not. If we query a table that doesn't exist, we'll get an error; let's use that to 86 | # our advantage. 87 | 88 | 89 | def table_exist(table_name): 90 | try: 91 | c.execute("SELECT * FROM {}".format(table_name)) 92 | except sqlite3.OperationalError: 93 | return False 94 | else: 95 | return True 96 | 97 | if table_exist('week') is False: 98 | c.execute("CREATE TABLE week ({})".format(', '.join(fields))) 99 | # So now we have to write a SQL statement that will insert values into the right fields, 100 | # regardless of how long the field is. To make this happen, we're also going to need to 101 | # deal with some dict order weirdness by specifying fields for our inserted values. 102 | for rec in week: 103 | cols = [] 104 | vals = [] 105 | for item in rec: 106 | if item != 'location': 107 | cols.append(item) 108 | vals.append(str(rec[item])) 109 | # SQL format: INSERT INTO
(, , ...) VALUES ('', '', ...) 110 | c.execute("INSERT INTO week ({0}) VALUES ('{1}')".format(', '.join(cols), "', '".join(vals))) 111 | conn.commit() 112 | else: 113 | print 'Table already exists.' 114 | 115 | # Some basic queries based on the data. 116 | 117 | # A function to assess the week's crimes. 118 | 119 | 120 | def high_crime_areas(main_type, area, top): 121 | valid_areas = ['district', 'beat', 'community_area', 'ward'] 122 | if area in valid_areas: 123 | result = [area, main_type] 124 | c.execute('''SELECT {0}, count(*) 125 | FROM week 126 | WHERE primary_type = '{1}' 127 | GROUP BY 1 128 | ORDER BY 2 DESC 129 | LIMIT 0,{2}'''.format(area, main_type, top)) 130 | result.append(c.fetchall()) 131 | return result 132 | 133 | # And a function to format the result of high_crime_areas! 134 | 135 | 136 | def show_crimes(result): 137 | head_string = '{0} BY {1}'.format(result[1], result[0]) 138 | print head_string.upper().replace('_', ' ') 139 | print '-' * len(head_string) 140 | for t in result[2]: 141 | print '{0}: {1}'.format(t[0], t[1]) 142 | print '\n\n' 143 | 144 | # This could be set up to run some quick summaries as soon as the data is processed by 145 | # the script. 146 | show_crimes(high_crime_areas('HOMICIDE', 'district', 100)) 147 | show_crimes(high_crime_areas('NARCOTICS', 'beat', 5)) 148 | show_crimes(high_crime_areas('ASSAULT', 'ward', 10)) 149 | show_crimes(high_crime_areas('THEFT', 'community_area', 10)) 150 | 151 | # Violent crime rates in each Chicago community for the most recent week of data available 152 | 153 | sql = '''SELECT chicago_areas.comm_name, ROUND((crime_query.violent_crimes*1.0/chicago_areas.pop2010) * 10000,2) as rate 154 | FROM ( 155 | SELECT community_area, COUNT(*) AS violent_crimes 156 | FROM week 157 | WHERE primary_type in ('HOMICIDE', 'CRIM SEXUAL ASSAULT', 'ROBBERY', 'ASSAULT', 'BATTERY') 158 | GROUP BY 1 159 | ) as crime_query, chicago_areas 160 | WHERE crime_query.community_area = chicago_areas.comm_id 161 | ORDER BY 2 DESC''' 162 | 163 | c.execute(sql) 164 | results = c.fetchall() 165 | print 'Violent crime rates by community' 166 | print '-' * 32 167 | for row in results: 168 | offset = ' ' * (25 - len(row[0])) 169 | print '{0}:{1}{2}'.format(row[0], offset, row[1]) 170 | -------------------------------------------------------------------------------- /site/pt4/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Making a function - Coding for Journalists 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
37 | 38 | 39 | 154 | 155 |
156 | 157 | 158 | 162 | 163 | 164 |
165 |
166 |
167 |
    168 |
  • 169 | 170 | 171 | 172 |
  • Making a function
  • 173 |
  • 180 |
181 |
182 |
183 |
184 |
185 | 186 |

Making a function

187 |

This is a pretty quick task -- the next time we have to deal with a list from the same agency, we shouldn't have to spend time rewriting our code from scratch or even go back to revise it to handle a new file. We have something that works, so let's turn it into a function we can call whenever we need to parse addresses for a list of financial licensees.
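Before we open the files, here's a minimal sketch of the idea, not the completed exercise itself: the function name, its signature and the column layout below are placeholders, and the real versions live in payday_parser.py and call_function.py.

# payday_parser.py (sketch): wrap the working parser in a function that takes
# the HTML file to read and the CSV file to write, so nothing is hard-coded.
import csv
from bs4 import BeautifulSoup


def parse_licensees(html_file, csv_file):      # placeholder name and signature
    soup = BeautifulSoup(open(html_file).read(), 'html.parser')
    with open(csv_file, 'wb') as out:          # 'wb' keeps Python 2's csv module happy
        writer = csv.writer(out)
        writer.writerow(['name', 'street', 'city_state_zip'])   # placeholder columns
        # ...the line-by-line address parsing from the previous exercise goes here...

# call_function.py (sketch): import the function and aim it at the new listing.
from payday_parser import parse_licensees
parse_licensees('consumer_installment.html', 'consumer_installment.csv')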

188 |

This exercise contains the following files:

189 |

payday_parser.py: Our parser from the previous exercise. We'll turn the existing work into a function and generalize it a bit to handle a file that's not specifically the list of licensed payday lenders.

190 |

consumer_installment.html: Another listing from the state of Illinois, but this time it encompasses more than 1,000 licensed consumer installment lenders.

191 |

call_function.py: A script we'll write to call the parser function from payday_parser.py and direct it toward our HTML file.

192 |

Finished versions are in the completed folder.

193 | 194 |
195 |
196 | 219 | 220 |
221 |
222 | 223 |
224 | 225 |
226 | 227 |
240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /site/mkdocs/js/mustache.min.js: -------------------------------------------------------------------------------- 1 | (function(global,factory){if(typeof exports==="object"&&exports){factory(exports)}else if(typeof define==="function"&&define.amd){define(["exports"],factory)}else{factory(global.Mustache={})}})(this,function(mustache){var Object_toString=Object.prototype.toString;var isArray=Array.isArray||function(object){return Object_toString.call(object)==="[object Array]"};function isFunction(object){return typeof object==="function"}function escapeRegExp(string){return string.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g,"\\$&")}var RegExp_test=RegExp.prototype.test;function testRegExp(re,string){return RegExp_test.call(re,string)}var nonSpaceRe=/\S/;function isWhitespace(string){return!testRegExp(nonSpaceRe,string)}var entityMap={"&":"&","<":"<",">":">",'"':""","'":"'","/":"/"};function escapeHtml(string){return String(string).replace(/[&<>"'\/]/g,function(s){return entityMap[s]})}var whiteRe=/\s*/;var spaceRe=/\s+/;var equalsRe=/\s*=/;var curlyRe=/\s*\}/;var tagRe=/#|\^|\/|>|\{|&|=|!/;function parseTemplate(template,tags){if(!template)return[];var sections=[];var tokens=[];var spaces=[];var hasTag=false;var nonSpace=false;function stripSpace(){if(hasTag&&!nonSpace){while(spaces.length)delete tokens[spaces.pop()]}else{spaces=[]}hasTag=false;nonSpace=false}var openingTagRe,closingTagRe,closingCurlyRe;function compileTags(tags){if(typeof tags==="string")tags=tags.split(spaceRe,2);if(!isArray(tags)||tags.length!==2)throw new Error("Invalid tags: "+tags);openingTagRe=new RegExp(escapeRegExp(tags[0])+"\\s*");closingTagRe=new RegExp("\\s*"+escapeRegExp(tags[1]));closingCurlyRe=new RegExp("\\s*"+escapeRegExp("}"+tags[1]))}compileTags(tags||mustache.tags);var scanner=new Scanner(template);var start,type,value,chr,token,openSection;while(!scanner.eos()){start=scanner.pos;value=scanner.scanUntil(openingTagRe);if(value){for(var i=0,valueLength=value.length;i0?sections[sections.length-1][4]:nestedTokens;break;default:collector.push(token)}}return nestedTokens}function Scanner(string){this.string=string;this.tail=string;this.pos=0}Scanner.prototype.eos=function(){return this.tail===""};Scanner.prototype.scan=function(re){var match=this.tail.match(re);if(!match||match.index!==0)return"";var string=match[0];this.tail=this.tail.substring(string.length);this.pos+=string.length;return string};Scanner.prototype.scanUntil=function(re){var index=this.tail.search(re),match;switch(index){case-1:match=this.tail;this.tail="";break;case 0:match="";break;default:match=this.tail.substring(0,index);this.tail=this.tail.substring(index)}this.pos+=match.length;return match};function Context(view,parentContext){this.view=view;this.cache={".":this.view};this.parent=parentContext}Context.prototype.push=function(view){return new Context(view,this)};Context.prototype.lookup=function(name){var cache=this.cache;var value;if(name in cache){value=cache[name]}else{var context=this,names,index,lookupHit=false;while(context){if(name.indexOf(".")>0){value=context.view;names=name.split(".");index=0;while(value!=null&&index")value=this._renderPartial(token,context,partials,originalTemplate);else if(symbol==="&")value=this._unescapedValue(token,context);else if(symbol==="name")value=this._escapedValue(token,context);else if(symbol==="text")value=this._rawValue(token);if(value!==undefined)buffer+=value}return 
buffer};Writer.prototype._renderSection=function(token,context,partials,originalTemplate){var self=this;var buffer="";var value=context.lookup(token[1]);function subRender(template){return self.render(template,context,partials)}if(!value)return;if(isArray(value)){for(var j=0,valueLength=value.length;j 2 | 3 | 4 | 5 | 6 | 7 | 8 | Other scrapes - Coding for Journalists 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
37 | 38 | 39 | 154 | 155 |
156 | 157 | 158 | 162 | 163 | 164 |
165 |
166 |
167 |
    168 |
  • 169 | 170 | 171 | 172 |
  • Other scrapes
  • 173 |
  • 180 |
181 |
182 |
183 |
184 |
185 | 186 |

Other scrapes

187 |

Scraping can be about more than parsing table tags in HTML.

188 |

The first thing we'll do is collect files from a website with Python.
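As a rough sketch of that first task (the URL below is a placeholder; other_scrapes.py points at a real agency page), the pattern is: fetch the page, find the links that end in .pdf, then stream each one to disk.

# Sketch: save every PDF linked from a page. The URL is made up, and the simple
# string concatenation assumes the hrefs are relative links; urljoin is the
# sturdier choice for anything messier.
import os
import requests
from bs4 import BeautifulSoup

page_url = 'http://example.gov/reports/'        # placeholder URL
soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')

for link in soup.find_all('a'):
    href = link.get('href', '')
    if href.lower().endswith('.pdf'):
        pdf = requests.get(page_url + href)
        with open(os.path.basename(href), 'wb') as f:
            f.write(pdf.content)
        print('Saved ' + os.path.basename(href))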

189 |

Then we'll turn our attention to this common problem: javascript:__doPostBack() isn't a link you can follow with a click in Python, but sometimes the data you need is behind it.

190 |

If we're trying to collect data from a government website built on ASP.NET, we can watch the transaction between our browser and the site unfurl using developer tools like the ones built into Chrome. The requests library isn't just good for fetching URLs; it's full service, and it can POST information to a site and capture the response. Based on the headers we see exchanged, we can copy that information and use requests to send it ourselves.
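Here's roughly what that replay looks like; every URL and form value below is a placeholder to be swapped for whatever the developer tools actually show.

# Sketch: replay an ASP.NET postback with requests. The hidden __VIEWSTATE and
# __EVENTVALIDATION fields come from the page itself, and the values below are
# stand-ins -- copy the real ones from the Network tab in Chrome's developer tools.
import requests

url = 'http://example.state.il.us/contributions.aspx'     # placeholder URL
form_data = {
    '__EVENTTARGET': 'ctl00$DownloadList',     # the control named in __doPostBack()
    '__VIEWSTATE': 'copied-from-the-page',
    '__EVENTVALIDATION': 'copied-from-the-page',
}

response = requests.post(url, data=form_data)

# If the site answers with a tab-delimited file, save it like any other download.
with open('contributions.txt', 'wb') as f:
    f.write(response.content)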

191 |

The files we'll be using:

192 |
    193 |
  • 194 |

    other_scrapes.py: This script will collect a set of PDFs. We'll point BeautifulSoup at the page contents to collect the links and then pipe the contents of those links (the PDFs) to files on our machine.

    195 |
  • 196 |
  • 197 |

    other_scrapes_post.py: We'll use requests to POST data to the Illinois Elections site, causing its ASP.NET framework to give us a tab-delimited text file that contains the day's political contributions.

    198 |
  • 199 |
200 | 201 |
202 |
203 | 226 | 227 |
228 |
229 | 230 |
231 | 232 |
233 | 234 |
247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /1_intro/completed/exercises_done.py: -------------------------------------------------------------------------------- 1 | # ---------------- 2 | # BASIC DATA TYPES 3 | # ---------------- 4 | 5 | # Print your name in the interpreter. 6 | print 'Jane Smith' 7 | 8 | # Print an integer. 9 | print 8 10 | 11 | # Print a basic math equation. 12 | print 2 + 2 13 | 14 | # Define a string called my_string and wrap it in single quotes. 15 | my_string = 'This is a string.' 16 | 17 | # Define another string called my_other_string and wrap it in double quotes. 18 | my_other_string = "This is also a string." 19 | 20 | # Define an integer (whole number) and call it my_integer. 21 | my_integer = 14 22 | 23 | # Define a float (number that contains a fraction of one) called my_float. 24 | my_float = 5.45 25 | 26 | # Define a text string in unicode format (we'll see these pop up throughout. 27 | # the workshop. 28 | my_unicode_string = u'ASCII is holding me back — you know it\'s true.' 29 | 30 | # Define a boolean value (True/False) called my_boolean. 31 | my_boolean = True 32 | 33 | # Print my_string. 34 | print my_string 35 | 36 | # Print my_string and my_other_string together. 37 | print my_string + my_other_string 38 | 39 | # Let's get a space in there. 40 | print my_string + ' ' + my_other_string 41 | 42 | # Print my_integer divided by 3. 43 | print my_integer / 3 44 | 45 | # If we don't define one of these as a float, Python 2.7 lops off extra. 46 | # digits. Try it again with the 3 as a float. 47 | print my_integer / float(3) 48 | 49 | # Check the data types of some of what we defined above. 50 | print type(my_integer) 51 | print type(my_string) 52 | print type(my_boolean) 53 | 54 | # Print the result of checking whether my_boolean is True and is not True. 
55 | print my_boolean is True 56 | print my_boolean is not True 57 | 58 | 59 | # In iPython, typing the following will load the variables from 'var.py' 60 | # into the interactive interpreter: %run var.py 61 | # We can also type: from var import * 62 | 63 | # ---------------- 64 | # FUN WITH NUMBERS 65 | # ---------------- 66 | 67 | # Print the contents of lucky_number 68 | print lucky_number 69 | 70 | # Subtract 18 from lucky_number and print it 71 | print lucky_number - 18 72 | 73 | # Add six to lucky_number and put it in a variable called unlucky_number 74 | unlucky_number = lucky_number + 6 75 | 76 | # Print unlucky_number 77 | print unlucky_number 78 | 79 | # Set lucky_number to lucky_number plus one; print lucky_number 80 | lucky_number = lucky_number + 1 81 | print lucky_number 82 | 83 | # Check to see if lucky_number and unlucky_number are equal and print the result 84 | print lucky_number == unlucky_number 85 | 86 | # Check to see if lucky_number is less than unlucky_number and print the result 87 | print lucky_number < unlucky_number 88 | 89 | # Check unlucky_number's type 90 | type(unlucky_number) 91 | 92 | # Check the type of unlucky_number added to fuel_2015 93 | type(unlucky_number + fuel_2015) 94 | 95 | 96 | # ---------------- 97 | # FUN WITH STRINGS 98 | # ---------------- 99 | 100 | # Print the contents of sentiment 101 | print sentiment 102 | 103 | # Print the length of sentiment 104 | print len(sentiment) 105 | 106 | # Print the length of lucky_number 107 | print len(lucky_number) 108 | 109 | # Try printing sentiment as all capital letters 110 | print sentiment.upper() 111 | 112 | # In a variable called new_sentiment, put sentiment in all caps again and replace 113 | # 'moderately' with 'extremely' 114 | new_sentiment = sentiment.upper().replace('MODERATELY', 'EXTREMELY') 115 | 116 | # Print the result 117 | print new_sentiment 118 | 119 | # Print ugly_string, which has too many spaces 120 | print ugly_string 121 | 122 | # Try splitting that string apart (defaults to space) 123 | ugly_string.split() 124 | 125 | # Try splitting ugly_string on San 126 | ugly_string.split('San') 127 | 128 | # Join a series of words together with a space between each and print the result 129 | print 'a'+' '+'series'+' '+'of'+' '+'words' 130 | 131 | # Do the same thing but use Python's join function 132 | print ' '.join(['a', 'series', 'of', 'words']) 133 | 134 | # Split ugly_string apart again based on spaces, then join back together with 135 | # a single space between the words; call it pretty_string 136 | pretty_string = ' '.join(ugly_string.split()) 137 | 138 | # Print the string 'apple ' three times 139 | print 'apple ' * 3 140 | 141 | 142 | # ----- 143 | # LISTS 144 | # ----- 145 | 146 | # Define a list called my_list that contains three strings: Tomato, Celery 147 | # and Carrot 148 | my_list = ['Tomato', 'Celery', 'Carrot'] 149 | 150 | # Print the list 151 | print my_list 152 | 153 | # Print the first item in the list 154 | print my_list[0] 155 | 156 | # Print the second item in the list 157 | print my_list[1] 158 | 159 | # Add 'Potato' to my_list 160 | my_list.append('Potato') 161 | 162 | # Print the contents of my_list again 163 | print my_list 164 | 165 | # ------------ 166 | # DICTIONARIES 167 | # ------------ 168 | 169 | # Make a simple dictionary of four items called my_dict: 170 | # class: Python, location: New York, time: 9am, attendance: 20 171 | 172 | my_dict = {'class': 'Python', 'location': 'New York', 'time': '9am', 'attendance': 20} 173 | 174 | # Print my_dict. 
175 | print my_dict 176 | 177 | # Print the value for location. 178 | print my_dict['location'] 179 | 180 | # Print the keys in my_dict. 181 | print my_dict.keys() 182 | 183 | # Print the values in my_dict. 184 | print my_dict.values() 185 | 186 | # Check to see if a key 'month' exists in my_dict 187 | print 'month' in my_dict 188 | 189 | 190 | # --------------------------------- 191 | # FUN WITH LISTS (AND DICTIONARIES) 192 | # --------------------------------- 193 | 194 | # Print months 195 | print months 196 | 197 | # Print the length of months 198 | print len(months) 199 | 200 | # Add missing month to list of months; print months again 201 | months.append('Dec') 202 | print months 203 | 204 | # Print the first item in the months list 205 | print months[0] 206 | 207 | # Print the third item in the months list 208 | print months[2] 209 | 210 | # Print the last item in the months list 211 | print months[-1] 212 | 213 | # Print the third through sixth items; print everything from seven onward 214 | print months[2:6] 215 | print months[6:] 216 | 217 | # Print multi_list 218 | print multi_list 219 | 220 | # Print the second item in multi_list's last list 221 | print multi_list[-1][1] 222 | 223 | # Print person_info 224 | print person_info 225 | 226 | # Print the item linked to first_name in person_info 227 | print person_info['first_name'] 228 | 229 | # Add Pennsylvania with a key of state to person_info; print the result 230 | person_info['state'] = 'Pennsylvania' 231 | print person_info 232 | 233 | # Change city in person_info to Scranton; print the result 234 | person_info['city'] = 'Scranton' 235 | print person_info 236 | 237 | 238 | # --------------------- 239 | # FUN WITH CONTROL FLOW 240 | # --------------------- 241 | 242 | # Write a for loop that prints each month in the months list 243 | for month in months: 244 | print month 245 | 246 | # Get a list of the keys from the person_info dictionary 247 | for key in person_info: 248 | print key 249 | 250 | # Write a for loop that prints the key/value pair in our person_info dictionary 251 | for key in person_info: 252 | print 'The key is '+key+' and the value is '+person_info[key] 253 | 254 | # A for loop that gives a quick summary of each list in multi_list 255 | for sublist in multi_list: 256 | print 'This list has', len(sublist), 'items:' 257 | for item in sublist: 258 | print item 259 | print '\n' 260 | 261 | # An if/else statement that checks the value in lucky_number 262 | if lucky_number == 7: 263 | print 'Still seven!' 264 | else: 265 | print 'Not seven anymore.' 266 | 267 | 268 | # ------------------ 269 | # FUN WITH FUNCTIONS 270 | # ------------------ 271 | 272 | # Define a basic function that prints the word 'beans' 273 | def PrintBeans(): 274 | print 'Beans' 275 | 276 | # Run the PrintBeans() function. 277 | PrintBeans() 278 | 279 | # Define another basic function that multiplies a number by itself 280 | 281 | 282 | def Square(number): 283 | print number * number 284 | 285 | # Find the square of 27. 286 | Square(27) 287 | 288 | # Try finding the square of 'apple.' 
289 | Square('apple') 290 | 291 | # Let's turn that list summary for loop from earlier into a function 292 | 293 | 294 | def ListSummary(list): 295 | for sublist in list: 296 | print 'This list has', len(sublist), 'items:' 297 | for item in sublist: 298 | print item 299 | print '\n' 300 | 301 | # Append the months list to multi_list; run the ListSummary function on it 302 | multi_list.append(months) 303 | ListSummary(multi_list) 304 | -------------------------------------------------------------------------------- /site/pt3/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Parsing text across lines - Coding for Journalists 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
37 | 38 | 39 | 154 | 155 |
156 | 157 | 158 | 162 | 163 | 164 |
165 |
166 |
167 |
    168 |
  • 169 | 170 | 171 | 172 |
  • Parsing text across lines
  • 173 |
  • 180 |
181 |
182 |
183 |
184 |
185 | 186 |

Parsing text across lines

187 |

It would be great if every piece of data you came across was in a format that lent itself to easy capture. In the same vein as the text extraction from reactor detail pages in our last example, we're going to pick apart an HTML file of licensed payday lenders (that's mostly text) and turn it into a flat CSV file where one row is one record.

188 |

In this file, addresses can span three, four or five lines. Sometimes an address takes four lines because a lender does business in Illinois under another name; in other cases, it's because the lender operates out of a suite, room or building stored on a line separate from the street address. This means our script needs to behave four different ways depending on how many lines it encounters for each address, and we'll switch among those behaviors with if/elif syntax.
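Sketched out ahead of time, the dispatch looks something like this; the field order and the test for a "doing business as" line are guesses for illustration, and the real layout comes from reading payday_lenders.html.

# Sketch of the if/elif branching on how many lines one record takes up.
# The sample record and the 'D/B/A' test are placeholders.
address_block = ['ACME LENDING INC.', '123 MAIN ST', 'SPRINGFIELD, IL 62701']

dba, suite = '', ''
if len(address_block) == 3:
    name, street, city_line = address_block
elif len(address_block) == 4 and 'D/B/A' in address_block[1]:    # second line is another business name
    name, dba, street, city_line = address_block
elif len(address_block) == 4:                                    # second line is a suite, room or building
    name, street, suite, city_line = address_block
elif len(address_block) == 5:
    name, dba, street, suite, city_line = address_block

print([name, dba, street, suite, city_line])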

189 |

We'll again use BeautifulSoup, but primarily to break out the portion of the file we want to capture for the resulting CSV.
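In spirit, that division of labor looks like the sketch below; the way we grab the text and the column names are placeholders until we inspect the file.

# Sketch: let BeautifulSoup pull out the text we care about, then do the actual
# record-splitting with plain string handling and csv.
import csv
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('payday_lenders.html').read(), 'html.parser')
text = soup.get_text()     # or soup.find(...) on a particular tag to narrow it to just the listings
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

with open('payday_lenders.csv', 'wb') as out:       # 'wb' for Python 2's csv module
    writer = csv.writer(out)
    writer.writerow(['name', 'dba', 'street', 'suite', 'city_state_zip'])   # placeholder columns
    # ...group `lines` into one record per lender and apply the if/elif
    # branching sketched above before writing each row...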

190 |

This exercise has the following files:

191 |
    192 |
  • 193 |

    payday.py: The file we'll use to write our address parser, following the comments.

    194 |
  • 195 |
  • 196 |

    payday_lenders.html: A simple HTML file that lists nearly 500 payday lenders licensed to do business in Illinois. Their addresses are split across multiple lines.

    197 |
  • 198 |
199 |

As with other exercises, finished versions are in the completed folder.

200 | 201 |
202 |
203 | 226 | 227 |
228 |
229 | 230 |
231 | 232 |
233 | 234 |
247 | 248 | 249 | 250 | --------------------------------------------------------------------------------